From b79a652d746bc186b0de559aa237462e7ba09109 Mon Sep 17 00:00:00 2001
From: Timothy Pearson <tpearson@raptorengineeringinc.com>
Date: Sat, 5 Sep 2015 17:55:58 -0500
Subject: [PATCH 010/143] northbridge/amd/amdmct: Fix broken AMD K10 DDR3
 memory initialization

Change-Id: Iab690db769e820600693ad1170085623b177b94e
Signed-off-by: Timothy Pearson <tpearson@raptorengineeringinc.com>
---
 src/northbridge/amd/amdfam10/raminit_amdmct.c   |    2 +
 src/northbridge/amd/amdmct/mct/mct_d.c          |    1 -
 src/northbridge/amd/amdmct/mct_ddr3/mct_d.c     |  177 ++++-
 src/northbridge/amd/amdmct/mct_ddr3/mct_d.h     |    8 +-
 src/northbridge/amd/amdmct/mct_ddr3/mct_d_gcc.h |   87 +--
 src/northbridge/amd/amdmct/mct_ddr3/mctardk6.c  |    6 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c  |  806 ++++++++++++-----------
 src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c    |    6 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctmtr_d.c  |   14 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctndi_d.c  |    3 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctproc.c   |   19 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c    |    5 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c    |  800 +++++++++++-----------
 src/northbridge/amd/amdmct/mct_ddr3/mctsrc1p.c  |   18 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctsrc2p.c  |   13 +-
 src/northbridge/amd/amdmct/mct_ddr3/mcttmrl.c   |    7 +-
 src/northbridge/amd/amdmct/mct_ddr3/mctwl.c     |   42 +-
 src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c   |  267 ++++----
 src/northbridge/amd/amdmct/wrappers/mcti_d.c    |  110 +---
 19 files changed, 1252 insertions(+), 1139 deletions(-)

diff --git a/src/northbridge/amd/amdfam10/raminit_amdmct.c b/src/northbridge/amd/amdfam10/raminit_amdmct.c
index a0d47f4..a585fae 100644
--- a/src/northbridge/amd/amdfam10/raminit_amdmct.c
+++ b/src/northbridge/amd/amdfam10/raminit_amdmct.c
@@ -28,12 +28,14 @@ static  void print_tx(const char *strval, u32 val)
 }
 #endif
 
+#if (CONFIG_DIMM_SUPPORT & 0x000F)!=0x0005 /* not needed for AMD_FAM10_DDR3 */
 static  void print_t(const char *strval)
 {
 #if CONFIG_DEBUG_RAM_SETUP
 	printk(BIOS_DEBUG, "%s", strval);
 #endif
 }
+#endif
 
 static  void print_tf(const char *func, const char *strval)
 {
diff --git a/src/northbridge/amd/amdmct/mct/mct_d.c b/src/northbridge/amd/amdmct/mct/mct_d.c
index 3dec934..88910e2 100644
--- a/src/northbridge/amd/amdmct/mct/mct_d.c
+++ b/src/northbridge/amd/amdmct/mct/mct_d.c
@@ -542,7 +542,6 @@ static void HTMemMapInit_D(struct MCTStatStruc *pMCTstat,
 		pDCTstat = pDCTstatA + Node;
 		devx = pDCTstat->dev_map;
 		DramSelBaseAddr = 0;
-		pDCTstat = pDCTstatA + Node;
 		if (!pDCTstat->GangedMode) {
 			DramSelBaseAddr = pDCTstat->NodeSysLimit - pDCTstat->DCTSysLimit;
 			/*In unganged mode, we must add DCT0 and DCT1 to DCTSysLimit */
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
index 71a6be8..fa59d71 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.c
@@ -214,6 +214,8 @@ static const u8 Table_DQSRcvEn_Offset[] = {0x00,0x01,0x10,0x11,0x2};
 static const u8 Tab_L1CLKDis[]  = {0x20, 0x20, 0x10, 0x10, 0x08, 0x08, 0x04, 0x04};
 static const u8 Tab_AM3CLKDis[] = {0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00};
 static const u8 Tab_S1CLKDis[]  = {0xA2, 0xA2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static const u8 Tab_C32CLKDis[] = {0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00};	/* Enable CS0 - CS3 clocks (DIMM0 - DIMM1) */
+static const u8 Tab_G34CLKDis[] = {0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00};	/* Enable CS0 - CS3 clocks (DIMM0 - DIMM1) */
 static const u8 Tab_ManualCLKDis[]= {0x10, 0x04, 0x08, 0x20, 0x00, 0x00, 0x00, 0x00};
 
 static const u8 Table_Comp_Rise_Slew_20x[] = {7, 3, 2, 2, 0xFF};
@@ -277,6 +279,11 @@ restartinit:
 	for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) {
 		struct DCTStatStruc *pDCTstat;
 		pDCTstat = pDCTstatA + Node;
+
+		/* Zero out data structures to avoid false detection of DIMMs */
+		memset(pDCTstat, 0, sizeof(struct DCTStatStruc));
+
+		/* Initialize data structures */
 		pDCTstat->Node_ID = Node;
 		pDCTstat->dev_host = PA_HOST(Node);
 		pDCTstat->dev_map = PA_MAP(Node);
@@ -284,17 +291,22 @@ restartinit:
 		pDCTstat->dev_nbmisc = PA_NBMISC(Node);
 		pDCTstat->NodeSysBase = node_sys_base;
 
+		printk(BIOS_DEBUG, "mctAutoInitMCT_D: mct_init Node %d\n", Node);
 		mct_init(pMCTstat, pDCTstat);
 		mctNodeIDDebugPort_D();
 		pDCTstat->NodePresent = NodePresent_D(Node);
 		if (pDCTstat->NodePresent) {		/* See if Node is there*/
+			printk(BIOS_DEBUG, "mctAutoInitMCT_D: clear_legacy_Mode\n");
 			clear_legacy_Mode(pMCTstat, pDCTstat);
 			pDCTstat->LogicalCPUID = mctGetLogicalCPUID_D(Node);
 
+			printk(BIOS_DEBUG, "mctAutoInitMCT_D: mct_InitialMCT_D\n");
 			mct_InitialMCT_D(pMCTstat, pDCTstat);
 
+			printk(BIOS_DEBUG, "mctAutoInitMCT_D: mctSMBhub_Init\n");
 			mctSMBhub_Init(Node);		/* Switch SMBUS crossbar to proper node*/
 
+			printk(BIOS_DEBUG, "mctAutoInitMCT_D: mct_initDCT\n");
 			mct_initDCT(pMCTstat, pDCTstat);
 			if (pDCTstat->ErrCode == SC_FatalErr) {
 				goto fatalexit;		/* any fatal errors?*/
@@ -345,6 +357,7 @@ restartinit:
 
 	mct_FinalMCT_D(pMCTstat, pDCTstatA);
 	printk(BIOS_DEBUG, "mctAutoInitMCT_D Done: Global Status: %x\n", pMCTstat->GStatus);
+
 	return;
 
 fatalexit:
@@ -560,7 +573,6 @@ static void HTMemMapInit_D(struct MCTStatStruc *pMCTstat,
 		pDCTstat = pDCTstatA + Node;
 		devx = pDCTstat->dev_map;
 		DramSelBaseAddr = 0;
-		pDCTstat = pDCTstatA + Node; /* ??? */
 		if (!pDCTstat->GangedMode) {
 			DramSelBaseAddr = pDCTstat->NodeSysLimit - pDCTstat->DCTSysLimit;
 			/*In unganged mode, we must add DCT0 and DCT1 to DCTSysLimit */
@@ -645,6 +657,7 @@ static void HTMemMapInit_D(struct MCTStatStruc *pMCTstat,
 		devx = pDCTstat->dev_map;
 
 		if (pDCTstat->NodePresent) {
+			printk(BIOS_DEBUG, " Copy dram map from Node 0 to Node %02x \n", Node);
 			reg = 0x40;		/*Dram Base 0*/
 			do {
 				val = Get_NB32(dev, reg);
@@ -1162,7 +1175,7 @@ static void SPD2ndTiming(struct MCTStatStruc *pMCTstat,
 
 	/* Program DRAM Timing values */
 	DramTimingLo = 0;	/* Dram Timing Low init */
-	val = pDCTstat->CASL - 2; /* pDCTstat.CASL to reg. definition */
+	val = pDCTstat->CASL - 4; /* pDCTstat.CASL to reg. definition */
 	DramTimingLo |= val;
 
 	val = pDCTstat->Trcd - Bias_TrcdT;
@@ -1406,18 +1419,16 @@ static void SPDGetTCL_D(struct MCTStatStruc *pMCTstat,
 	else if (tCKproposed16x <= 24) {
 		pDCTstat->TargetFreq = 6;
 		tCKproposed16x = 24;
-	}
-	else if (tCKproposed16x <= 30) {
+	} else if (tCKproposed16x <= 30) {
 		pDCTstat->TargetFreq = 5;
 		tCKproposed16x = 30;
-	}
-	else {
+	} else {
 		pDCTstat->TargetFreq = 4;
 		tCKproposed16x = 40;
 	}
 	/* Running through this loop twice:
 	   - First time find tCL at target frequency
-	   - Second tim find tCL at 400MHz */
+	   - Second time find tCL at 400MHz */
 
 	for (;;) {
 		CLT_Fail = 0;
@@ -1451,7 +1462,7 @@ static void SPDGetTCL_D(struct MCTStatStruc *pMCTstat,
 			CLT_Fail = 1;
 		/* get CL and T */
 		if (!CLT_Fail) {
-			bytex = CLactual - 2;
+			bytex = CLactual;
 			if (tCKproposed16x == 20)
 				byte = 7;
 			else if (tCKproposed16x == 24)
@@ -1632,7 +1643,7 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 		val = 0x0f; /* recommended setting (default) */
 	DramConfigHi |= val << 24;
 
-	if (pDCTstat->LogicalCPUID & (AMD_DR_Cx | AMD_DR_Bx))
+	if (pDCTstat->LogicalCPUID & (AMD_DR_Dx | AMD_DR_Cx | AMD_DR_Bx))
 		DramConfigHi |= 1 << DcqArbBypassEn;
 
 	/* Build MemClkDis Value from Dram Timing Lo and
@@ -1657,6 +1668,10 @@ static u8 AutoConfig_D(struct MCTStatStruc *pMCTstat,
 				p = Tab_L1CLKDis;
 			else if (byte == PT_M2 || byte == PT_AS)
 				p = Tab_AM3CLKDis;
+			else if (byte == PT_C3)
+				p = Tab_C32CLKDis;
+			else if (byte == PT_GR)
+				p = Tab_G34CLKDis;
 			else
 				p = Tab_S1CLKDis;
 
@@ -2102,8 +2117,7 @@ static u8 DIMMPresence_D(struct MCTStatStruc *pMCTstat,
 				if (byte == JED_RDIMM || byte == JED_MiniRDIMM) {
 					RegDIMMPresent |= 1 << i;
 					pDCTstat->DimmRegistered[i] = 1;
-				}
-				else {
+				} else {
 					pDCTstat->DimmRegistered[i] = 0;
 				}
 				/* Check ECC capable */
@@ -2977,9 +2991,9 @@ static void mct_FinalMCT_D(struct MCTStatStruc *pMCTstat,
 		} else {	/* For Dx CPU */
 			val = 0x0CE00F00 | 1 << 29/* FlushWrOnStpGnt */;
 			if (!(pDCTstat->GangedMode))
-				val |= 0x20; /* MctWrLimit =  8 for Unganed mode */
+				val |= 0x20; /* MctWrLimit =  8 for Unganged mode */
 			else
-				val |= 0x40; /* MctWrLimit =  16 for ganed mode */
+				val |= 0x40; /* MctWrLimit =  16 for ganged mode */
 			Set_NB32(pDCTstat->dev_dct, 0x11C, val);
 
 			val = Get_NB32(pDCTstat->dev_dct, 0x1B0);
@@ -3414,6 +3428,138 @@ static void mct_BeforeDramInit_Prod_D(struct MCTStatStruc *pMCTstat,
 			Set_NB32(dev,  0x98 + reg_off, 0x0D000030);
 			Set_NB32(dev,  0x9C + reg_off, dword);
 			Set_NB32(dev,  0x98 + reg_off, 0x4D040F30);
+
+			/* FIXME
+			 * Mainboards need to be able to specify the maximum number of DIMMs installable per channel
+			 * For now assume a maximum of 2 DIMMs per channel can be installed
+			 */
+			uint8_t MaxDimmsInstallable = 2;
+
+			/* Obtain number of DIMMs on channel */
+			uint8_t dimm_count = pDCTstat->MAdimms[i];
+			uint8_t rank_count_dimm0;
+			uint8_t rank_count_dimm1;
+			uint32_t odt_pattern_0;
+			uint32_t odt_pattern_1;
+			uint32_t odt_pattern_2;
+			uint32_t odt_pattern_3;
+
+			/* Select appropriate ODT pattern for installed DIMMs
+			 * Refer to the BKDG Rev. 3.62, page 120 onwards
+			 */
+			if (pDCTstat->C_DCTPtr[i]->Status[DCT_STATUS_REGISTERED]) {
+				if (MaxDimmsInstallable == 2) {
+					if (dimm_count == 1) {
+						/* 1 DIMM detected */
+						rank_count_dimm1 = pDCTstat->C_DCTPtr[i]->DimmRanks[1];
+						if (rank_count_dimm1 == 1) {
+							odt_pattern_0 = 0x00000000;
+							odt_pattern_1 = 0x00000000;
+							odt_pattern_2 = 0x00000000;
+							odt_pattern_3 = 0x00020000;
+						} else if (rank_count_dimm1 == 2) {
+							odt_pattern_0 = 0x00000000;
+							odt_pattern_1 = 0x00000000;
+							odt_pattern_2 = 0x00000000;
+							odt_pattern_3 = 0x02080000;
+						} else if (rank_count_dimm1 == 4) {
+							odt_pattern_0 = 0x00000000;
+							odt_pattern_1 = 0x00000000;
+							odt_pattern_2 = 0x020a0000;
+							odt_pattern_3 = 0x080a0000;
+						} else {
+							/* Fallback */
+							odt_pattern_0 = 0x00000000;
+							odt_pattern_1 = 0x00000000;
+							odt_pattern_2 = 0x00000000;
+							odt_pattern_3 = 0x00000000;
+						}
+					} else {
+						/* 2 DIMMs detected */
+						rank_count_dimm0 = pDCTstat->C_DCTPtr[i]->DimmRanks[0];
+						rank_count_dimm1 = pDCTstat->C_DCTPtr[i]->DimmRanks[1];
+						if ((rank_count_dimm0 < 4) && (rank_count_dimm1 < 4)) {
+							odt_pattern_0 = 0x00000000;
+							odt_pattern_1 = 0x01010202;
+							odt_pattern_2 = 0x00000000;
+							odt_pattern_3 = 0x09030603;
+						} else if ((rank_count_dimm0 < 4) && (rank_count_dimm1 == 4)) {
+							odt_pattern_0 = 0x01010000;
+							odt_pattern_1 = 0x01010a0a;
+							odt_pattern_2 = 0x01090000;
+							odt_pattern_3 = 0x01030e0b;
+						} else if ((rank_count_dimm0 == 4) && (rank_count_dimm1 < 4)) {
+							odt_pattern_0 = 0x00000202;
+							odt_pattern_1 = 0x05050202;
+							odt_pattern_2 = 0x00000206;
+							odt_pattern_3 = 0x0d070203;
+						} else if ((rank_count_dimm0 == 4) && (rank_count_dimm1 == 4)) {
+							odt_pattern_0 = 0x05050a0a;
+							odt_pattern_1 = 0x05050a0a;
+							odt_pattern_2 = 0x050d0a0e;
+							odt_pattern_3 = 0x05070a0b;
+						} else {
+							/* Fallback */
+							odt_pattern_0 = 0x00000000;
+							odt_pattern_1 = 0x00000000;
+							odt_pattern_2 = 0x00000000;
+							odt_pattern_3 = 0x00000000;
+						}
+					}
+				} else {
+					/* FIXME
+					 * 3 DIMMs per channel UNIMPLEMENTED
+					 */
+					odt_pattern_0 = 0x00000000;
+					odt_pattern_1 = 0x00000000;
+					odt_pattern_2 = 0x00000000;
+					odt_pattern_3 = 0x00000000;
+				}
+			} else {
+				if (MaxDimmsInstallable == 2) {
+					if (dimm_count == 1) {
+						/* 1 DIMM detected */
+						rank_count_dimm1 = pDCTstat->C_DCTPtr[i]->DimmRanks[1];
+						if (rank_count_dimm1 == 1) {
+							odt_pattern_0 = 0x00000000;
+							odt_pattern_1 = 0x00000000;
+							odt_pattern_2 = 0x00000000;
+							odt_pattern_3 = 0x00020000;
+						} else if (rank_count_dimm1 == 2) {
+							odt_pattern_0 = 0x00000000;
+							odt_pattern_1 = 0x00000000;
+							odt_pattern_2 = 0x00000000;
+							odt_pattern_3 = 0x02080000;
+						} else {
+							/* Fallback */
+							odt_pattern_0 = 0x00000000;
+							odt_pattern_1 = 0x00000000;
+							odt_pattern_2 = 0x00000000;
+							odt_pattern_3 = 0x00000000;
+						}
+					} else {
+						/* 2 DIMMs detected */
+						odt_pattern_0 = 0x00000000;
+						odt_pattern_1 = 0x01010202;
+						odt_pattern_2 = 0x00000000;
+						odt_pattern_3 = 0x09030603;
+					}
+				} else {
+					/* FIXME
+					 * 3 DIMMs per channel UNIMPLEMENTED
+					 */
+					odt_pattern_0 = 0x00000000;
+					odt_pattern_1 = 0x00000000;
+					odt_pattern_2 = 0x00000000;
+					odt_pattern_3 = 0x00000000;
+				}
+			}
+
+			/* Program ODT pattern */
+			Set_NB32_index_wait(dev, 0xf0 + reg_off, 0x180, odt_pattern_1);
+			Set_NB32_index_wait(dev, 0xf0 + reg_off, 0x181, odt_pattern_0);
+			Set_NB32_index_wait(dev, 0xf0 + reg_off, 0x182, odt_pattern_3);
+			Set_NB32_index_wait(dev, 0xf0 + reg_off, 0x183, odt_pattern_2);
 		}
 	}
 }
@@ -3657,6 +3803,7 @@ static void mct_BeforeDQSTrain_D(struct MCTStatStruc *pMCTstat,
 	}
 }
 
+/* Erratum 350 */
 static void mct_ResetDLL_D(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 dct)
 {
@@ -3692,11 +3839,11 @@ static void mct_ResetDLL_D(struct MCTStatStruc *pMCTstat,
 				mct_Read1LTestPattern_D(pMCTstat, pDCTstat, addr);	/* cache fills */
 
 				/* Write 0000_8000h to register F2x[1,0]9C_xD080F0C */
-				Set_NB32_index_wait(dev, 0x98 + reg_off, 0x4D080F0C, 0x00008000);
+				Set_NB32_index_wait(dev, 0x98 + reg_off, 0xD080F0C, 0x00008000);
 				mct_Wait(80); /* wait >= 300ns */
 
 				/* Write 0000_0000h to register F2x[1,0]9C_xD080F0C */
-				Set_NB32_index_wait(dev, 0x98 + reg_off, 0x4D080F0C, 0x00000000);
+				Set_NB32_index_wait(dev, 0x98 + reg_off, 0xD080F0C, 0x00000000);
 				mct_Wait(800); /* wait >= 2us */
 				break;
 			}
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
index e2d7aa8..219aa42 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mct_d.h
@@ -499,7 +499,7 @@ struct DCTStatStruc {		/* A per Node structure*/
 		/* CHB DIMM0 Byte 0 - 7  TxDqs */
 		/* CHB DIMM1 Byte 0 - 7  TxDqs */
 		/* CHB DIMM1 Byte 0 - 7  TxDqs */
-	u8 CH_D_B_RCVRDLY[2][4][8];	/* [A/B] [DIMM0-3] [DQS] */
+	u16 CH_D_B_RCVRDLY[2][4][8];	/* [A/B] [DIMM0-3] [DQS] */
 		/* CHA DIMM 0 Receiver Enable Delay*/
 		/* CHA DIMM 1 Receiver Enable Delay*/
 		/* CHA DIMM 2 Receiver Enable Delay*/
@@ -509,7 +509,7 @@ struct DCTStatStruc {		/* A per Node structure*/
 		/* CHB DIMM 1 Receiver Enable Delay*/
 		/* CHB DIMM 2 Receiver Enable Delay*/
 		/* CHB DIMM 3 Receiver Enable Delay*/
-	u8 CH_D_BC_RCVRDLY[2][4];
+	u16 CH_D_BC_RCVRDLY[2][4];
 		/* CHA DIMM 0 - 4 Check Byte Receiver Enable Delay*/
 		/* CHB DIMM 0 - 4 Check Byte Receiver Enable Delay*/
 	u8 DIMMValidDCT[2];	/* DIMM# in DCT0*/
@@ -769,7 +769,7 @@ u8 mct_checkNumberOfDqsRcvEn_1Pass(u8 pass);
 u32 SetupDqsPattern_1PassA(u8 Pass);
 u32 SetupDqsPattern_1PassB(u8 Pass);
 u8 mct_Get_Start_RcvrEnDly_1Pass(u8 Pass);
-u8 mct_Average_RcvrEnDly_Pass(struct DCTStatStruc *pDCTstat, u8 RcvrEnDly, u8 RcvrEnDlyLimit, u8 Channel, u8 Receiver, u8 Pass);
+u16 mct_Average_RcvrEnDly_Pass(struct DCTStatStruc *pDCTstat, u16 RcvrEnDly, u16 RcvrEnDlyLimit, u8 Channel, u8 Receiver, u8 Pass);
 void CPUMemTyping_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 void UMAMemTyping_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 u32 mctGetLogicalCPUID(u32 Node);
@@ -779,7 +779,7 @@ void mct_TrainDQSPos_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTs
 void mctSetEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 void TrainMaxReadLatency_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA);
 void mct_EndDQSTraining_D(struct MCTStatStruc *pMCTstat,struct DCTStatStruc *pDCTstatA);
-void mct_SetRcvrEnDly_D(struct DCTStatStruc *pDCTstat, u8 RcvrEnDly, u8 FinalValue, u8 Channel, u8 Receiver, u32 dev, u32 index_reg, u8 Addl_Index, u8 Pass);
+void mct_SetRcvrEnDly_D(struct DCTStatStruc *pDCTstat, u16 RcvrEnDly, u8 FinalValue, u8 Channel, u8 Receiver, u32 dev, u32 index_reg, u8 Addl_Index, u8 Pass);
 void SetEccDQSRcvrEn_D(struct DCTStatStruc *pDCTstat, u8 Channel);
 void mctGet_PS_Cfg_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, u32 dct);
 void InterleaveBanks_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, u8 dct);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mct_d_gcc.h b/src/northbridge/amd/amdmct/mct_ddr3/mct_d_gcc.h
index 60f98bc..c40ea1a 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mct_d_gcc.h
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mct_d_gcc.h
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -103,10 +104,10 @@ static void proc_CLFLUSH(u32 addr_hi)
 
 	__asm__ volatile (
 		/* clflush fs:[eax] */
-		"outb %%al, $0xed\n\t"	/* _EXECFENCE */
-		 "clflush %%fs:(%0)\n\t"
+		"outb %%al, $0xed\n\t"  /* _EXECFENCE */
+		"clflush %%fs:(%0)\n\t"
 		"mfence\n\t"
-		 ::"a" (addr_hi<<8)
+		::"a" (addr_hi<<8)
 	);
 }
 
@@ -141,6 +142,24 @@ static u32 read32_fs(u32 addr_lo)
 	return value;
 }
 
+static uint64_t read64_fs(uint32_t addr_lo)
+{
+	uint64_t value = 0;
+	uint32_t value_lo;
+	uint32_t value_hi;
+
+	__asm__ volatile (
+		"outb %%al, $0xed\n\t"  /* _EXECFENCE */
+		"mfence\n\t"
+		"movl %%fs:(%2), %0\n\t"
+		"movl %%fs:(%3), %1\n\t"
+		:"=c"(value_lo), "=d"(value_hi): "a" (addr_lo), "b" (addr_lo + 4) : "memory"
+	);
+	value |= value_lo;
+	value |= ((uint64_t)value_hi) << 32;
+	return value;
+}
+
 #ifdef UNUSED_CODE
 static u8 read8_fs(u32 addr_lo)
 {
@@ -210,68 +229,6 @@ static __attribute__((noinline)) void FlushDQSTestPattern_L18(u32 addr_lo)
 	);
 }
 
-static void ReadL18TestPattern(u32 addr_lo)
-{
-	/* set fs and use fs prefix to access the mem */
-	__asm__ volatile (
-		"outb %%al, $0xed\n\t"			/* _EXECFENCE */
-		"movl %%fs:-128(%%esi), %%eax\n\t" 	/* TestAddr cache line */
-		"movl %%fs:-64(%%esi), %%eax\n\t"	/* +1 */
-		"movl %%fs:(%%esi), %%eax\n\t"		/* +2 */
-		"movl %%fs:64(%%esi), %%eax\n\t"	/* +3 */
-
-		"movl %%fs:-128(%%edi), %%eax\n\t"	/* +4 */
-		"movl %%fs:-64(%%edi), %%eax\n\t"	/* +5 */
-		"movl %%fs:(%%edi), %%eax\n\t"		/* +6 */
-		"movl %%fs:64(%%edi), %%eax\n\t"	/* +7 */
-
-		"movl %%fs:-128(%%ebx), %%eax\n\t"	/* +8 */
-		"movl %%fs:-64(%%ebx), %%eax\n\t"	/* +9 */
-		"movl %%fs:(%%ebx), %%eax\n\t"		/* +10 */
-		"movl %%fs:64(%%ebx), %%eax\n\t"	/* +11 */
-
-		"movl %%fs:-128(%%ecx), %%eax\n\t"	/* +12 */
-		"movl %%fs:-64(%%ecx), %%eax\n\t"	/* +13 */
-		"movl %%fs:(%%ecx), %%eax\n\t"		/* +14 */
-		"movl %%fs:64(%%ecx), %%eax\n\t"	/* +15 */
-
-		"movl %%fs:-128(%%edx), %%eax\n\t"	/* +16 */
-		"movl %%fs:-64(%%edx), %%eax\n\t"	/* +17 */
-		"mfence\n\t"
-
-		 :: "a"(0), "b" (addr_lo+128+8*64), "c" (addr_lo+128+12*64),
-		    "d" (addr_lo +128+16*64), "S"(addr_lo+128),
-		    "D"(addr_lo+128+4*64)
-	);
-
-}
-
-static void ReadL9TestPattern(u32 addr_lo)
-{
-
-	/* set fs and use fs prefix to access the mem */
-	__asm__ volatile (
-		"outb %%al, $0xed\n\t"			/* _EXECFENCE */
-
-		"movl %%fs:-128(%%ecx), %%eax\n\t"	/* TestAddr cache line */
-		"movl %%fs:-64(%%ecx), %%eax\n\t"	/* +1 */
-		"movl %%fs:(%%ecx), %%eax\n\t"		/* +2 */
-		"movl %%fs:64(%%ecx), %%eax\n\t"	/* +3 */
-
-		"movl %%fs:-128(%%edx), %%eax\n\t"	/* +4 */
-		"movl %%fs:-64(%%edx), %%eax\n\t"	/* +5 */
-		"movl %%fs:(%%edx), %%eax\n\t"		/* +6 */
-		"movl %%fs:64(%%edx), %%eax\n\t"	/* +7 */
-
-		"movl %%fs:-128(%%ebx), %%eax\n\t"	/* +8 */
-		"mfence\n\t"
-
-		 :: "a"(0), "b" (addr_lo+128+8*64), "c"(addr_lo+128),
-		    "d"(addr_lo+128+4*64)
-	);
-
-}
-
 static void ReadMaxRdLat1CLTestPattern_D(u32 addr)
 {
 	SetUpperFSbase(addr);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctardk6.c b/src/northbridge/amd/amdmct/mct_ddr3/mctardk6.c
index ae1654c..99a2628 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctardk6.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctardk6.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -17,7 +18,7 @@
  * Foundation, Inc.
  */
 
-/* The socket type F (1207), Fr2, G (1207) are not tested.
+/* The socket type Fr2, G (1207) are not tested.
  */
 
 static void Get_ChannelPS_Cfg0_D(u8 MAAdimms, u8 Speed, u8 MAAload,
@@ -79,8 +80,7 @@ static void Get_ChannelPS_Cfg0_D( u8 MAAdimms, u8 Speed, u8 MAAload,
 			else
 				*AddrTmgCTL = 0x00353935;
 		}
-	}
-	else {
+	} else {
 		if(Speed == 4) {
 			*AddrTmgCTL = 0x00000000;
 			if (MAAdimms == 3)
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
index 404727b..cc2f43a 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctdqs_d.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,13 +23,6 @@ static void CalcEccDQSPos_D(struct MCTStatStruc *pMCTstat,
 				u8 scale, u8 ChipSel);
 static void GetDQSDatStrucVal_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u8 ChipSel);
-static u8 MiddleDQS_D(u8 min, u8 max);
-static void TrainReadDQS_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat,
-				u8 cs_start);
-static void TrainWriteDQS_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat,
-				u8 cs_start);
 static void WriteDQSTestPattern_D(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat,
 					u32 TestAddr_lo);
@@ -43,31 +37,19 @@ static void FlushDQSTestPattern_D(struct DCTStatStruc *pDCTstat,
 					u32 addr_lo);
 static void SetTargetWTIO_D(u32 TestAddr);
 static void ResetTargetWTIO_D(void);
-static void ReadDQSTestPattern_D(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat,
-					u32 TestAddr_lo);
-static void mctEngDQSwindow_Save_D(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, u8 ChipSel,
-					u8 RnkDlyFilterMin, u8 RnkDlyFilterMax);
 void ResetDCTWrPtr_D(u32 dev, u32 index_reg, u32 index);
 u8 mct_DisableDimmEccEn_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat);
 static void mct_SetDQSDelayCSR_D(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat,
 					u8 ChipSel);
-static void mct_SetDQSDelayAllCSR_D(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat,
-					u8 cs_start);
 u32 mct_GetMCTSysAddr_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u8 Channel,
 				u8 receiver, u8 *valid);
 static void SetupDqsPattern_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat,
 				u32 *buffer);
-
-static void StoreWrRdDQSDatStrucVal_D(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, u8 ChipSel,
-				      u8 RnkDlyFilterMin, u8 RnkDlyFilterMax);
+static void proc_IOCLFLUSH_D(u32 addr_hi);
 
 static void StoreDQSDatStrucVal_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, u8 ChipSel);
 
@@ -286,20 +268,99 @@ static void CalcEccDQSPos_D(struct MCTStatStruc *pMCTstat,
 	pDCTstat->DQSDelay = (u8)DQSDelay;
 }
 
+static void write_dqs_write_data_timing_registers(uint16_t* delay, uint32_t dev, uint8_t dimm, uint32_t index_reg)
+{
+	uint32_t dword;
+
+	/* Lanes 0 - 3 */
+	dword = Get_NB32_index_wait(dev, index_reg, 0x1 | (dimm << 8));
+	dword &= ~0x7f7f7f7f;
+	dword |= (delay[3] & 0x7f) << 24;
+	dword |= (delay[2] & 0x7f) << 16;
+	dword |= (delay[1] & 0x7f) << 8;
+	dword |= delay[0] & 0x7f;
+	Set_NB32_index_wait(dev, index_reg, 0x1 | (dimm << 8), dword);
+
+	/* Lanes 4 - 7 */
+	dword = Get_NB32_index_wait(dev, index_reg, 0x2 | (dimm << 8));
+	dword &= ~0x7f7f7f7f;
+	dword |= (delay[7] & 0x7f) << 24;
+	dword |= (delay[6] & 0x7f) << 16;
+	dword |= (delay[5] & 0x7f) << 8;
+	dword |= delay[4] & 0x7f;
+	Set_NB32_index_wait(dev, index_reg, 0x2 | (dimm << 8), dword);
+
+	/* Lane 8 (ECC) */
+	dword = Get_NB32_index_wait(dev, index_reg, 0x3 | (dimm << 8));
+	dword &= ~0x0000007f;
+	dword |= delay[8] & 0x7f;
+	Set_NB32_index_wait(dev, index_reg, 0x3 | (dimm << 8), dword);
+}
+
+static void write_dqs_read_data_timing_registers(uint16_t* delay, uint32_t dev, uint8_t dimm, uint32_t index_reg)
+{
+	uint32_t dword;
+
+	/* Lanes 0 - 3 */
+	dword = Get_NB32_index_wait(dev, index_reg, 0x5 | (dimm << 8));
+	dword &= ~0x3f3f3f3f;
+	dword |= (delay[3] & 0x3f) << 24;
+	dword |= (delay[2] & 0x3f) << 16;
+	dword |= (delay[1] & 0x3f) << 8;
+	dword |= delay[0] & 0x3f;
+	Set_NB32_index_wait(dev, index_reg, 0x5 | (dimm << 8), dword);
+
+	/* Lanes 4 - 7 */
+	dword = Get_NB32_index_wait(dev, index_reg, 0x6 | (dimm << 8));
+	dword &= ~0x3f3f3f3f;
+	dword |= (delay[7] & 0x3f) << 24;
+	dword |= (delay[6] & 0x3f) << 16;
+	dword |= (delay[5] & 0x3f) << 8;
+	dword |= delay[4] & 0x3f;
+	Set_NB32_index_wait(dev, index_reg, 0x6 | (dimm << 8), dword);
+
+	/* Lane 8 (ECC) */
+	dword = Get_NB32_index_wait(dev, index_reg, 0x7 | (dimm << 8));
+	dword &= ~0x0000003f;
+	dword |= delay[8] & 0x3f;
+	Set_NB32_index_wait(dev, index_reg, 0x7 | (dimm << 8), dword);
+}
+
+/* DQS Position Training
+ * Algorithm detailed in the Fam10h BKDG Rev. 3.62 section 2.8.9.9.3
+ */
 static void TrainDQSRdWrPos_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat,
-				u8 cs_start)
+				struct DCTStatStruc *pDCTstat)
 {
 	u32 Errors;
-	u8 Channel, DQSWrDelay;
+	u8 Channel;
+	u8 Receiver;
 	u8 _DisableDramECC = 0;
-	u32 PatternBuffer[292];
+	u32 PatternBuffer[304];	/* 288 + 16 */
 	u8 _Wrap32Dis = 0, _SSE2 = 0;
-	u8 dqsWrDelay_end;
 
+	u32 dev;
 	u32 addr;
+	u8 valid;
 	u32 cr4;
 	u32 lo, hi;
+	u32 index_reg;
+	uint32_t TestAddr;
+
+	uint8_t dual_rank;
+	uint8_t iter;
+	uint8_t lane;
+	uint16_t bytelane_test_results;
+	uint16_t current_write_dqs_delay[MAX_BYTE_LANES];
+	uint16_t current_read_dqs_delay[MAX_BYTE_LANES];
+	uint16_t write_dqs_delay_stepping_done[MAX_BYTE_LANES];
+	uint8_t dqs_read_results_array[2][MAX_BYTE_LANES][64];		/* [rank][lane][step] */
+	uint8_t dqs_write_results_array[2][MAX_BYTE_LANES][128];	/* [rank][lane][step] */
+
+	uint8_t last_pos = 0;
+	uint8_t cur_count = 0;
+	uint8_t best_pos = 0;
+	uint8_t best_count = 0;
 
 	print_debug_dqs("\nTrainDQSRdWrPos: Node_ID ", pDCTstat->Node_ID, 0);
 	cr4 = read_cr4();
@@ -323,50 +384,363 @@ static void TrainDQSRdWrPos_D(struct MCTStatStruc *pMCTstat,
 	SetupDqsPattern_D(pMCTstat, pDCTstat, PatternBuffer);
 
 	/* mct_BeforeTrainDQSRdWrPos_D */
-	dqsWrDelay_end = 0x20;
+
+	dev = pDCTstat->dev_dct;
+	pDCTstat->Direction = DQS_READDIR;
+
+	/* 2.8.9.9.3 (2)
+	 * Loop over each channel, lane, and rank
+	 */
+
+	/* NOTE
+	 * The BKDG originally stated to iterate over lane, then rank, however this process is quite slow
+	 * compared to an equivalent loop over rank, then lane as the latter allows multiple lanes to be
+	 * tested simultaneously, thus improving performance by around 8x.
+	 */
 
 	Errors = 0;
 	for (Channel = 0; Channel < 2; Channel++) {
-		print_debug_dqs("\tTrainDQSRdWrPos: 1 Channel ",Channel, 1);
+		print_debug_dqs("\tTrainDQSRdWrPos: 1 Channel ", Channel, 1);
 		pDCTstat->Channel = Channel;
 
 		if (pDCTstat->DIMMValidDCT[Channel] == 0)	/* mct_BeforeTrainDQSRdWrPos_D */
 			continue;
-		pDCTstat->DqsRdWrPos_Saved = 0;
-		for ( DQSWrDelay = 0; DQSWrDelay < dqsWrDelay_end; DQSWrDelay++) {
-			pDCTstat->DQSDelay = DQSWrDelay;
-			pDCTstat->Direction = DQS_WRITEDIR;
-			mct_SetDQSDelayAllCSR_D(pMCTstat, pDCTstat, cs_start);
-
-			print_debug_dqs("\t\tTrainDQSRdWrPos: 21 DQSWrDelay ", DQSWrDelay, 2);
-			TrainReadDQS_D(pMCTstat, pDCTstat, cs_start);
-			print_debug_dqs("\t\tTrainDQSRdWrPos: 21 DqsRdWrPos_Saved ", pDCTstat->DqsRdWrPos_Saved, 2);
-			if (pDCTstat->DqsRdWrPos_Saved == 0xFF)
-				break;
-
-			print_debug_dqs("\t\tTrainDQSRdWrPos: 22 TrainErrors ",pDCTstat->TrainErrors, 2);
-			if (pDCTstat->TrainErrors == 0) {
+
+		index_reg = 0x98 + 0x100 * Channel;
+
+		dual_rank = 0;
+		Receiver = mct_InitReceiver_D(pDCTstat, Channel);
+		/* There are four receiver pairs, loosely associated with chipselects.
+		 * This is essentially looping over each rank of each DIMM.
+		 */
+		for (; Receiver < 8; Receiver++) {
+			if ((Receiver & 0x1) == 0) {
+				/* Even rank of DIMM */
+				if(mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, Channel, Receiver+1))
+					dual_rank = 1;
+				else
+					dual_rank = 0;
+			}
+
+			if (!mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, Channel, Receiver)) {
+				continue;
+			}
+
+			/* Select the base test address for the current rank */
+			TestAddr = mct_GetMCTSysAddr_D(pMCTstat, pDCTstat, Channel, Receiver, &valid);
+			if (!valid) {	/* Address not supported on current CS */
+				continue;
+			}
+
+			print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 14 TestAddr ", TestAddr, 4);
+			SetUpperFSbase(TestAddr);	/* fs:eax=far ptr to target */
+
+			print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 12 Receiver ", Receiver, 2);
+
+			/* 2.8.9.9.3 (DRAM Write Data Timing Loop)
+			 * Iterate over all possible DQS delay values (0x0 - 0x7f)
+			 */
+			uint8_t test_write_dqs_delay = 0;
+			uint8_t test_read_dqs_delay = 0;
+			uint8_t passing_dqs_delay_found[MAX_BYTE_LANES];
+
+			/* Initialize variables */
+			for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+				current_write_dqs_delay[lane] = 0;
+				passing_dqs_delay_found[lane] = 0;
+				write_dqs_delay_stepping_done[lane] = 0;
+			}
+
+			for (test_write_dqs_delay = 0; test_write_dqs_delay < 128; test_write_dqs_delay++) {
+				print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 16 test_write_dqs_delay ", test_write_dqs_delay, 6);
+
+				/* Break out of loop if a passing window was already found for all of lanes 0 - 7 */
+				if (write_dqs_delay_stepping_done[0] && write_dqs_delay_stepping_done[1]
+					&& write_dqs_delay_stepping_done[2] && write_dqs_delay_stepping_done[3]
+					&& write_dqs_delay_stepping_done[4] && write_dqs_delay_stepping_done[5]
+					&& write_dqs_delay_stepping_done[6] && write_dqs_delay_stepping_done[7])
 					break;
+
+				/* Commit the current Write Data Timing settings to the hardware registers */
+				write_dqs_write_data_timing_registers(current_write_dqs_delay, dev, (Receiver >> 1), index_reg);
+
+				/* Write the DRAM training pattern to the base test address */
+				WriteDQSTestPattern_D(pMCTstat, pDCTstat, TestAddr << 8);
+
+				/* 2.8.9.9.3 (DRAM Read DQS Timing Control Loop)
+				 * Iterate over all possible DQS delay values (0x0 - 0x3f)
+				 */
+				for (test_read_dqs_delay = 0; test_read_dqs_delay < 64; test_read_dqs_delay++) {
+					print_debug_dqs("\t\t\t\t\tTrainDQSRdWrPos: 161 test_read_dqs_delay ", test_read_dqs_delay, 6);
+
+					/* Initialize Read DQS Timing Control settings for this iteration */
+					for (lane = 0; lane < MAX_BYTE_LANES; lane++)
+						if (!write_dqs_delay_stepping_done[lane])
+							current_read_dqs_delay[lane] = test_read_dqs_delay;
+
+					/* Commit the current Read DQS Timing Control settings to the hardware registers */
+					write_dqs_read_data_timing_registers(current_read_dqs_delay, dev, (Receiver >> 1), index_reg);
+
+					/* Initialize test result variable */
+					bytelane_test_results = 0xff;
+
+					/* Read the DRAM training pattern back from the base test address
+					 * NOTE
+					 * While the BKDG states to read the pattern three times, this is probably excessive!
+					 * Decrease training time by only reading the test pattern once per iteration
+					 */
+					for (iter = 0; iter < 1; iter++) {
+						/* Flush caches */
+						SetTargetWTIO_D(TestAddr);
+						FlushDQSTestPattern_D(pDCTstat, TestAddr << 8);
+						ResetTargetWTIO_D();
+
+						/* Read and compare pattern */
+						bytelane_test_results &= (CompareDQSTestPattern_D(pMCTstat, pDCTstat, TestAddr << 8) & 0xff); /* [Lane 7 :: Lane 0] 0=fail, 1=pass */
+
+						/* If all lanes have already failed testing bypass remaining re-read attempt(s) */
+						if (bytelane_test_results == 0x0)
+							break;
+					}
+
+					/* Store the pass / fail result of each lane still being stepped for later use */
+					for (lane = 0; lane < 8; lane++)
+						if (!write_dqs_delay_stepping_done[lane])
+							dqs_read_results_array[Receiver & 0x1][lane][test_read_dqs_delay] = (!!(bytelane_test_results & (1 << lane)));
+
+					print_debug_dqs("\t\t\t\t\tTrainDQSRdWrPos: 162 bytelane_test_results ", bytelane_test_results, 6);
+				}
+
+				for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+					if (write_dqs_delay_stepping_done[lane])
+						continue;
+
+					/* Determine location and length of longest consecutive string of passing values
+					 * Output is stored in best_pos and best_count
+					 */
+					last_pos = 0;
+					cur_count = 0;
+					best_pos = 0;
+					best_count = 0;
+					for (iter = 0; iter < 64; iter++) {
+						if ((dqs_read_results_array[Receiver & 0x1][lane][iter]) && (iter < 63)) {
+							/* Pass */
+							cur_count++;
+						} else {
+							/* Failure or end of loop */
+							if (cur_count > best_count) {
+								best_count = cur_count;
+								best_pos = last_pos;
+							}
+							cur_count = 0;
+							last_pos = iter;
+						}
+					}
+
+					if (best_count > 2) {
+						/* Exit the DRAM Write Data Timing Loop after programming the Read DQS Timing Control
+						 * register with the center of the passing window
+						 */
+						current_read_dqs_delay[lane] = (best_pos + (best_count / 2));
+						passing_dqs_delay_found[lane] = 1;
+
+						/* Commit the current Read DQS Timing Control settings to the hardware registers */
+						write_dqs_read_data_timing_registers(current_read_dqs_delay, dev, (Receiver >> 1), index_reg);
+
+						/* Exit the DRAM Write Data Timing Loop */
+						write_dqs_delay_stepping_done[lane] = 1;
+
+						print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 142 largest passing region ", best_count, 4);
+						print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 143 largest passing region start ", best_pos, 4);
+					}
+
+					/* Increment the DQS Write Delay value if needed for the next DRAM Write Data Timing Loop iteration */
+					if (!write_dqs_delay_stepping_done[lane])
+						current_write_dqs_delay[lane]++;
+				}
 			}
-			Errors |= pDCTstat->TrainErrors;
-		}
 
-		pDCTstat->DqsRdWrPos_Saved = 0;
-		if (DQSWrDelay < dqsWrDelay_end) {
-			Errors = 0;
+			/* Flag failure(s) if present */
+			for (lane = 0; lane < 8; lane++) {
+				if (!passing_dqs_delay_found[lane]) {
+					print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 121 Unable to find passing region for lane ", lane, 2);
+
+					/* Flag absence of passing window */
+					Errors |= 1 << SB_NODQSPOS;
+				}
+			}
+
+			/* Iterate over all possible Write Data Timing values (0x0 - 0x7f)
+			 * Note that the Read DQS Timing Control was calibrated / centered in the prior nested loop
+			 */
+			for (test_write_dqs_delay = 0; test_write_dqs_delay < 128; test_write_dqs_delay++) {
+				/* Initialize Write Data Timing settings for this iteration */
+				for (lane = 0; lane < MAX_BYTE_LANES; lane++)
+					current_write_dqs_delay[lane] = test_write_dqs_delay;
+
+				/* Commit the current Write Data Timing settings to the hardware registers */
+				write_dqs_write_data_timing_registers(current_write_dqs_delay, dev, (Receiver >> 1), index_reg);
+
+				/* Write the DRAM training pattern to the base test address */
+				WriteDQSTestPattern_D(pMCTstat, pDCTstat, TestAddr << 8);
+
+				/* Flush caches */
+				SetTargetWTIO_D(TestAddr);
+				FlushDQSTestPattern_D(pDCTstat, TestAddr << 8);
+				ResetTargetWTIO_D();
+
+				/* Read and compare pattern from the base test address */
+				bytelane_test_results = (CompareDQSTestPattern_D(pMCTstat, pDCTstat, TestAddr << 8) & 0xff); /* [Lane 7 :: Lane 0] 0=fail, 1=pass */
+
+				/* Store the pass / fail result of each lane for later use */
+				for (lane = 0; lane < 8; lane++)
+					dqs_write_results_array[Receiver & 0x1][lane][test_write_dqs_delay] = (!!(bytelane_test_results & (1 << lane)));
+			}
+
+			for (lane = 0; lane < 8; lane++) {
+				if ((!dual_rank) || (dual_rank && (Receiver & 0x1))) {
+
+#ifdef PRINT_PASS_FAIL_BITMAPS
+					for (iter = 0; iter < 64; iter++) {
+						if (dqs_read_results_array[0][lane][iter])
+							printk(BIOS_DEBUG, "+");
+						else
+							printk(BIOS_DEBUG, ".");
+					}
+					printk(BIOS_DEBUG, "\n");
+					for (iter = 0; iter < 64; iter++) {
+						if (dqs_read_results_array[1][lane][iter])
+							printk(BIOS_DEBUG, "+");
+						else
+							printk(BIOS_DEBUG, ".");
+					}
+					printk(BIOS_DEBUG, "\n\n");
+					for (iter = 0; iter < 128; iter++) {
+						if (dqs_write_results_array[0][lane][iter])
+							printk(BIOS_DEBUG, "+");
+						else
+							printk(BIOS_DEBUG, ".");
+					}
+					printk(BIOS_DEBUG, "\n");
+					for (iter = 0; iter < 128; iter++) {
+						if (dqs_write_results_array[1][lane][iter])
+							printk(BIOS_DEBUG, "+");
+						else
+							printk(BIOS_DEBUG, ".");
+					}
+					printk(BIOS_DEBUG, "\n\n");
+#endif
+
+					/* Base rank of single-rank DIMM, or odd rank of dual-rank DIMM */
+					if (dual_rank) {
+						/* Intersect the passing windows of both ranks */
+						for (iter = 0; iter < 64; iter++)
+							if (!dqs_read_results_array[1][lane][iter])
+								dqs_read_results_array[0][lane][iter] = 0;
+						for (iter = 0; iter < 128; iter++)
+							if (!dqs_write_results_array[1][lane][iter])
+								dqs_write_results_array[0][lane][iter] = 0;
+					}
+
+					/* Determine location and length of longest consecutive string of passing values for read DQS timing
+					 * Output is stored in best_pos and best_count
+					 */
+					last_pos = 0;
+					cur_count = 0;
+					best_pos = 0;
+					best_count = 0;
+					for (iter = 0; iter < 64; iter++) {
+						if ((dqs_read_results_array[0][lane][iter]) && (iter < 63)) {
+							/* Pass */
+							cur_count++;
+						} else {
+							/* Failure or end of loop */
+							if (cur_count > best_count) {
+								best_count = cur_count;
+								best_pos = last_pos;
+							}
+							cur_count = 0;
+							last_pos = iter;
+						}
+					}
+					print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 144 largest read passing region ", best_count, 4);
+					if (best_count > 0) {
+						if (best_count < MIN_DQS_WNDW) {
+							/* Flag excessively small passing window */
+							Errors |= 1 << SB_SMALLDQS;
+						}
+
+						/* Find the center of the passing window */
+						current_read_dqs_delay[lane] = (best_pos + (best_count / 2));
+
+						/* Commit the current Read DQS Timing Control settings to the hardware registers */
+						write_dqs_read_data_timing_registers(current_read_dqs_delay, dev, (Receiver >> 1), index_reg);
+
+						/* Save the final Read DQS Timing Control settings for later use */
+						pDCTstat->CH_D_DIR_B_DQS[Channel][Receiver >> 1][DQS_READDIR][lane] = current_read_dqs_delay[lane];
+					} else {
+						print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 122 Unable to find read passing region for lane ", lane, 2);
+
+						/* Flag absence of passing window */
+						Errors |= 1 << SB_NODQSPOS;
+					}
+
+					/* Determine location and length of longest consecutive string of passing values for write DQS timing
+					 * Output is stored in best_pos and best_count
+					 */
+					last_pos = 0;
+					cur_count = 0;
+					best_pos = 0;
+					best_count = 0;
+					for (iter = 0; iter < 128; iter++) {
+						if ((dqs_write_results_array[0][lane][iter]) && (iter < 127)) {
+							/* Pass */
+							cur_count++;
+						} else {
+							/* Failure or end of loop */
+							if (cur_count > best_count) {
+								best_count = cur_count;
+								best_pos = last_pos;
+							}
+							cur_count = 0;
+							last_pos = iter;
+						}
+					}
+					print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 145 largest write passing region ", best_count, 4);
+					if (best_count > 0) {
+						if (best_count < MIN_DQS_WNDW) {
+							/* Flag excessively small passing window */
+							Errors |= 1 << SB_SMALLDQS;
+						}
+
+						/* Find the center of the passing window */
+						current_write_dqs_delay[lane] = (best_pos + (best_count / 2));
+
+						/* Commit the current Write Data Timing settings to the hardware registers */
+						write_dqs_write_data_timing_registers(current_write_dqs_delay, dev, (Receiver >> 1), index_reg);
+
+						/* Save the final Write Data Timing settings for later use */
+						pDCTstat->CH_D_DIR_B_DQS[Channel][Receiver >> 1][DQS_WRITEDIR][lane] = current_write_dqs_delay[lane];
+					} else {
+						print_debug_dqs("\t\t\t\tTrainDQSRdWrPos: 123 Unable to find write passing region for lane ", lane, 2);
+
+						/* Flag absence of passing window */
+						Errors |= 1 << SB_NODQSPOS;
+					}
+				}
+			}
 
-			print_debug_dqs("\tTrainDQSRdWrPos: 231 DQSWrDelay ", DQSWrDelay, 1);
-			TrainWriteDQS_D(pMCTstat, pDCTstat, cs_start);
 		}
-		print_debug_dqs("\tTrainDQSRdWrPos: 232 Errors ", Errors, 1);
-		pDCTstat->ErrStatus |= Errors;
 	}
 
+	pDCTstat->TrainErrors |= Errors;
+	pDCTstat->ErrStatus |= Errors;
+
 #if DQS_TRAIN_DEBUG > 0
 	{
 		u8 val;
 		u8 i;
-		u8 Channel, Receiver, Dir;
+		u8 ChannelDTD, ReceiverDTD, Dir;
 		u8 *p;
 
 		for (Dir = 0; Dir < 2; Dir++) {
@@ -375,14 +749,14 @@ static void TrainDQSRdWrPos_D(struct MCTStatStruc *pMCTstat,
 			} else {
 				printk(BIOS_DEBUG, "TrainDQSRdWrPos: CH_D_DIR_B_DQS RD:\n");
 			}
-			for (Channel = 0; Channel < 2; Channel++) {
-				printk(BIOS_DEBUG, "Channel: %02x\n", Channel);
-				for (Receiver = cs_start; Receiver < (cs_start + 2); Receiver += 2) {
-					printk(BIOS_DEBUG, "\t\tReceiver: %02x: ", Receiver);
-					p = pDCTstat->CH_D_DIR_B_DQS[Channel][Receiver >> 1][Dir];
+			for (ChannelDTD = 0; ChannelDTD < 2; ChannelDTD++) {
+				printk(BIOS_DEBUG, "Channel: %02x\n", ChannelDTD);
+				for (ReceiverDTD = 0; ReceiverDTD < MAX_CS_SUPPORTED; ReceiverDTD += 2) {
+					printk(BIOS_DEBUG, "\t\tReceiver: %02x:", ReceiverDTD);
+					p = pDCTstat->CH_D_DIR_B_DQS[ChannelDTD][ReceiverDTD >> 1][Dir];
 					for (i=0;i<8; i++) {
 						val  = p[i];
-						printk(BIOS_DEBUG, "%02x ", val);
+						printk(BIOS_DEBUG, " %02x", val);
 					}
 					printk(BIOS_DEBUG, "\n");
 				}
@@ -437,225 +811,6 @@ static void SetupDqsPattern_D(struct MCTStatStruc *pMCTstat,
 	pDCTstat->PtrPatternBufA = (u32)buf;
 }
 
-static void TrainDQSPos_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat,
-				u8 cs_start)
-{
-	u32 Errors;
-	u8 ChipSel, DQSDelay;
-	u8 RnkDlySeqPassMin=0, RnkDlySeqPassMax=0xFF, RnkDlyFilterMin=0, RnkDlyFilterMax=0xFF;
-	u8 RnkDlySeqPassMinTot=0, RnkDlySeqPassMaxTot=0xFF, RnkDlyFilterMinTot=0, RnkDlyFilterMaxTot=0xFF;
-	u8 LastTest ,LastTestTot;
-	u32 TestAddr;
-	u8 ByteLane;
-	u8 MutualCSPassW[128];
-	u8 BanksPresent;
-	u8 dqsDelay_end;
-	u8 tmp, valid, tmp1;
-	u16 word;
-
-	/* MutualCSPassW: each byte represents a bitmap of pass/fail per
-	 * ByteLane.  The indext within MutualCSPassW is the delay value
-	 * given the results.
-	 */
-	print_debug_dqs("\t\t\tTrainDQSPos begin ", 0, 3);
-
-	Errors = 0;
-	BanksPresent = 0;
-
-	dqsDelay_end = 32;
-	/* Bitmapped status per delay setting, 0xff=All positions
-	 * passing (1= PASS). Set the entire array.
-	 */
-	for (DQSDelay=0; DQSDelay<128; DQSDelay++) {
-		MutualCSPassW[DQSDelay] = 0xFF;
-	}
-
-	for (ChipSel = cs_start; ChipSel < (cs_start + 2); ChipSel++) { /* logical register chipselects 0..7 */
-		print_debug_dqs("\t\t\t\tTrainDQSPos: 11 ChipSel ", ChipSel, 4);
-
-		if (!mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, pDCTstat->Channel, ChipSel)) {
-			print_debug_dqs("\t\t\t\tmct_RcvrRankEnabled_D CS not enabled ", ChipSel, 4);
-			continue;
-		}
-
-		BanksPresent = 1; 	/* flag for at least one bank is present */
-		TestAddr = mct_GetMCTSysAddr_D(pMCTstat, pDCTstat, pDCTstat->Channel, ChipSel, &valid);
-		if (!valid) {
-			print_debug_dqs("\t\t\t\tAddress not supported on current CS ", TestAddr, 4);
-			continue;
-		}
-
-		print_debug_dqs("\t\t\t\tTrainDQSPos: 12 TestAddr ", TestAddr, 4);
-		SetUpperFSbase(TestAddr);	/* fs:eax=far ptr to target */
-
-		if (pDCTstat->Direction == DQS_READDIR) {
-			print_debug_dqs("\t\t\t\tTrainDQSPos: 13 for read ", 0, 4);
-			WriteDQSTestPattern_D(pMCTstat, pDCTstat, TestAddr << 8);
-		}
-
-		for (DQSDelay = 0; DQSDelay < dqsDelay_end; DQSDelay++) {
-			print_debug_dqs("\t\t\t\t\tTrainDQSPos: 141 DQSDelay ", DQSDelay, 5);
-
-			tmp = 0xFF;
-			tmp1 = DQSDelay;
-			if (pDCTstat->Direction == DQS_READDIR) {
-				tmp &= MutualCSPassW[DQSDelay];
-				tmp1 += dqsDelay_end;
-			}
-			tmp &= MutualCSPassW[tmp1];
-
-			if (tmp == 0) {
-				continue;/* skip current delay value if other chipselects have failed all 8 bytelanes */
-			}
-
-			pDCTstat->DQSDelay = DQSDelay;
-			mct_SetDQSDelayAllCSR_D(pMCTstat, pDCTstat, cs_start);
-			print_debug_dqs("\t\t\t\t\tTrainDQSPos: 142 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
-
-			if (pDCTstat->Direction == DQS_WRITEDIR) {
-				print_debug_dqs("\t\t\t\t\tTrainDQSPos: 143 for write", 0, 5);
-				WriteDQSTestPattern_D(pMCTstat, pDCTstat, TestAddr << 8);
-			}
-
-			print_debug_dqs("\t\t\t\t\tTrainDQSPos: 144 Pattern ", pDCTstat->Pattern, 5);
-			ReadDQSTestPattern_D(pMCTstat, pDCTstat, TestAddr << 8);
-			/* print_debug_dqs("\t\t\t\t\tTrainDQSPos: 145 MutualCSPassW ", MutualCSPassW[DQSDelay], 5); */
-			word = CompareDQSTestPattern_D(pMCTstat, pDCTstat, TestAddr << 8); /* 0=fail, 1=pass */
-			print_debug_dqs("\t\t\t\t\tTrainDQSPos: 144 compare 1 ", word, 3);
-
-			print_debug_dqs("\t\t\t\t\tTrainDQSPos: 144 DqsRdWrPos_Saved ", pDCTstat->DqsRdWrPos_Saved, 3);
-			word &= ~(pDCTstat->DqsRdWrPos_Saved); /* mask out bytelanes that already passed */
-			word &= ~(pDCTstat->DqsRdWrPos_Saved << 8);
-			print_debug_dqs("\t\t\t\t\tTrainDQSPos: 144 compare 2 ", word, 3);
-
-			tmp = DQSDelay;
-			if (pDCTstat->Direction == DQS_READDIR) {
-				MutualCSPassW[tmp] &= word >> 8;
-				tmp += dqsDelay_end;
-			}
-			MutualCSPassW[tmp] &= word & 0xFF;
-
-			print_debug_dqs("\t\t\t\t\tTrainDQSPos: 146 \tMutualCSPassW ", MutualCSPassW[DQSDelay], 5);
-
-			SetTargetWTIO_D(TestAddr);
-			FlushDQSTestPattern_D(pDCTstat, TestAddr << 8);
-			ResetTargetWTIO_D();
-		}
-
-	}
-
-	if (pDCTstat->Direction == DQS_READDIR) {
-		dqsDelay_end <<= 1;
-	}
-
-	if (BanksPresent) {
-		#if 0		/* show the bitmap */
-		for (ByteLane = 0; ByteLane < 8; ByteLane++) { /* just print ByteLane 0 */
-			for (DQSDelay = 0; DQSDelay < dqsDelay_end; DQSDelay++) {
-				if (!(MutualCSPassW[DQSDelay] &(1 << ByteLane))) {
-					printk(BIOS_DEBUG, ".");
-				} else {
-					printk(BIOS_DEBUG, "*");
-				}
-			}
-			printk(BIOS_DEBUG, "\n");
-		}
-		#endif
-		for (ByteLane = 0; ByteLane < 8; ByteLane++) {
-			print_debug_dqs("\t\t\t\tTrainDQSPos: 31 ByteLane ",ByteLane, 4);
-			if (!(pDCTstat->DqsRdWrPos_Saved &(1 << ByteLane))) {
-				pDCTstat->ByteLane = ByteLane;
-				LastTest = DQS_FAIL;		/* Analyze the results */
-				LastTestTot = DQS_FAIL;
-				/* RnkDlySeqPassMin = 0; */
-				/* RnkDlySeqPassMax = 0; */
-				RnkDlyFilterMax = 0;
-				RnkDlyFilterMin = 0;
-				RnkDlyFilterMaxTot = 0;
-				RnkDlyFilterMinTot = 0;
-				for (DQSDelay = 0; DQSDelay < dqsDelay_end; DQSDelay++) {
-					if (MutualCSPassW[DQSDelay] & (1 << ByteLane)) {
-						print_debug_dqs("\t\t\t\t\tTrainDQSPos: 321 DQSDelay ", DQSDelay, 5);
-						print_debug_dqs("\t\t\t\t\tTrainDQSPos: 322 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
-						if (pDCTstat->Direction == DQS_READDIR)
-							tmp = 0x20;
-						else
-							tmp = 0;
-						if (DQSDelay >= tmp) {
-							RnkDlySeqPassMax = DQSDelay;
-							if (LastTest == DQS_FAIL) {
-								RnkDlySeqPassMin = DQSDelay; /* start sequential run */
-							}
-							if ((RnkDlySeqPassMax - RnkDlySeqPassMin)>(RnkDlyFilterMax-RnkDlyFilterMin)){
-								RnkDlyFilterMin = RnkDlySeqPassMin;
-								RnkDlyFilterMax = RnkDlySeqPassMax;
-							}
-							LastTest = DQS_PASS;
-						}
-
-						if (pDCTstat->Direction == DQS_READDIR) {
-							RnkDlySeqPassMaxTot = DQSDelay;
-							if (LastTestTot == DQS_FAIL)
-								RnkDlySeqPassMinTot = DQSDelay;
-							if ((RnkDlySeqPassMaxTot - RnkDlySeqPassMinTot)>(RnkDlyFilterMaxTot-RnkDlyFilterMinTot)){
-								RnkDlyFilterMinTot = RnkDlySeqPassMinTot;
-								RnkDlyFilterMaxTot = RnkDlySeqPassMaxTot;
-							}
-							LastTestTot = DQS_PASS;
-						}
-					} else {
-						LastTest = DQS_FAIL;
-						LastTestTot = DQS_FAIL;
-					}
-				}
-				print_debug_dqs("\t\t\t\tTrainDQSPos: 33 RnkDlySeqPassMax ", RnkDlySeqPassMax, 4);
-				if (RnkDlySeqPassMax == 0) {
-					Errors |= 1 << SB_NODQSPOS; /* no passing window */
-				} else {
-					print_debug_dqs_pair("\t\t\t\tTrainDQSPos: 34 RnkDlyFilter: ", RnkDlyFilterMin, " ",  RnkDlyFilterMax, 4);
-					if (((RnkDlyFilterMax - RnkDlyFilterMin) < MIN_DQS_WNDW)){
-						Errors |= 1 << SB_SMALLDQS;
-					} else {
-						u8 middle_dqs;
-						/* mctEngDQSwindow_Save_D Not required for arrays */
-						if (pDCTstat->Direction == DQS_READDIR)
-							middle_dqs = MiddleDQS_D(RnkDlyFilterMinTot, RnkDlyFilterMaxTot);
-						else
-							middle_dqs = MiddleDQS_D(RnkDlyFilterMin, RnkDlyFilterMax);
-						pDCTstat->DQSDelay = middle_dqs;
-						mct_SetDQSDelayCSR_D(pMCTstat, pDCTstat, cs_start);  /* load the register with the value */
-						if (pDCTstat->Direction == DQS_READDIR)
-							StoreWrRdDQSDatStrucVal_D(pMCTstat, pDCTstat, cs_start, RnkDlyFilterMinTot, RnkDlyFilterMaxTot); /* store the value into the data structure */
-						else
-							StoreWrRdDQSDatStrucVal_D(pMCTstat, pDCTstat, cs_start, RnkDlyFilterMin, RnkDlyFilterMax); /* store the value into the data structure */
-						print_debug_dqs("\t\t\t\tTrainDQSPos: 42 middle_dqs : ",middle_dqs, 4);
-						pDCTstat->DqsRdWrPos_Saved |= 1 << ByteLane;
-					}
-				}
-			}
-		} /* if (pDCTstat->DqsRdWrPos_Saved &(1 << ByteLane)) */
-	}
-/* skipLocMiddle: */
-	pDCTstat->TrainErrors = Errors;
-
-	print_debug_dqs("\t\t\tTrainDQSPos: Errors ", Errors, 3);
-}
-
-static void mctEngDQSwindow_Save_D(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, u8 ChipSel,
-					u8 RnkDlyFilterMin, u8 RnkDlyFilterMax)
-{
-	pDCTstat->CH_D_DIR_MaxMin_B_Dly[pDCTstat->Channel]
-		[pDCTstat->Direction]
-		[0]
-		[pDCTstat->ByteLane] = RnkDlyFilterMin;
-	pDCTstat->CH_D_DIR_MaxMin_B_Dly[pDCTstat->Channel]
-		[pDCTstat->Direction]
-		[1]
-		[pDCTstat->ByteLane] = RnkDlyFilterMax;
-}
-
 static void StoreDQSDatStrucVal_D(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat, u8 ChipSel)
 {
@@ -679,26 +834,6 @@ static void StoreDQSDatStrucVal_D(struct MCTStatStruc *pMCTstat,
 					pDCTstat->DQSDelay;
 }
 
-static void StoreWrRdDQSDatStrucVal_D(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, u8 ChipSel,
-					u8 RnkDlyFilterMin, u8 RnkDlyFilterMax)
-{
-	u8 dn;
-
-	if (pDCTstat->Direction == DQS_WRITEDIR) {
-		dn = ChipSel >> 1;
-		RnkDlyFilterMin += pDCTstat->CH_D_B_TxDqs[pDCTstat->Channel][dn][pDCTstat->ByteLane];
-		RnkDlyFilterMax += pDCTstat->CH_D_B_TxDqs[pDCTstat->Channel][dn][pDCTstat->ByteLane];
-		pDCTstat->DQSDelay += pDCTstat->CH_D_B_TxDqs[pDCTstat->Channel][dn][pDCTstat->ByteLane];
-	} else {
-		RnkDlyFilterMin <<= 1;
-		RnkDlyFilterMax <<= 1;
-		pDCTstat->DQSDelay <<= 1;
-	}
-	mctEngDQSwindow_Save_D(pMCTstat, pDCTstat, ChipSel, RnkDlyFilterMin, RnkDlyFilterMax);
-	StoreDQSDatStrucVal_D(pMCTstat, pDCTstat, ChipSel);
-}
-
 static void GetDQSDatStrucVal_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u8 ChipSel)
 {
@@ -720,33 +855,6 @@ static void GetDQSDatStrucVal_D(struct MCTStatStruc *pMCTstat,
 
 /* FindDQSDatDimmVal_D is not required since we use an array */
 
-static u8 MiddleDQS_D(u8 min, u8 max)
-{
-	u8 size;
-	size = max-min;
-	if (size % 2)
-		size++;		/* round up if the size isn't even. */
-	return ( min + (size >> 1));
-}
-
-static void TrainReadDQS_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat,
-				u8 cs_start)
-{
-	print_debug_dqs("\t\tTrainReadPos ", 0, 2);
-	pDCTstat->Direction = DQS_READDIR;
-	TrainDQSPos_D(pMCTstat, pDCTstat, cs_start);
-}
-
-static void TrainWriteDQS_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat,
-				u8 cs_start)
-{
-	pDCTstat->Direction = DQS_WRITEDIR;
-	print_debug_dqs("\t\tTrainWritePos", 0, 2);
-	TrainDQSPos_D(pMCTstat, pDCTstat, cs_start);
-}
-
 static void proc_IOCLFLUSH_D(u32 addr_hi)
 {
 	SetTargetWTIO_D(addr_hi);
@@ -963,30 +1071,6 @@ static void ResetTargetWTIO_D(void)
 	_WRMSR(0xc0010017, lo, hi); /* IORR0 Mask */
 }
 
-static void ReadDQSTestPattern_D(struct MCTStatStruc *pMCTstat,
-				struct DCTStatStruc *pDCTstat,
-				u32 TestAddr_lo)
-{
-	/* Read a pattern of 72 bit times (per DQ), to test dram functionality.
-	 * The pattern is a stress pattern which exercises both ISI and
-	 * crosstalk.  The number of cache lines to fill is dependent on DCT
-	 * width mode and burstlength.
-	 * Mode BL  Lines Pattern no.
-	 * ----+---+-------------------
-	 * 64	4	  9	0
-	 * 64	8	  9	0
-	 * 64M	4	  9	0
-	 * 64M	8	  9	0
-	 * 128	4	  18	1
-	 * 128	8	  N/A	-
-	 */
-	if (pDCTstat->Pattern == 0)
-		ReadL9TestPattern(TestAddr_lo);
-	else
-		ReadL18TestPattern(TestAddr_lo);
-	_MFENCE;
-}
-
 u32 SetUpperFSbase(u32 addr_hi)
 {
 	/* Set the upper 32-bits of the Base address, 4GB aligned) for the
@@ -1009,8 +1093,6 @@ void ResetDCTWrPtr_D(u32 dev, u32 index_reg, u32 index)
 	Set_NB32_index_wait(dev, index_reg, index, val);
 }
 
-/* mctEngDQSwindow_Save_D not required with arrays */
-
 void mct_TrainDQSPos_D(struct MCTStatStruc *pMCTstat,
 			struct DCTStatStruc *pDCTstatA)
 {
@@ -1021,8 +1103,8 @@ void mct_TrainDQSPos_D(struct MCTStatStruc *pMCTstat,
 	for (Node = 0; Node < MAX_NODES_SUPPORTED; Node++) {
 		pDCTstat = pDCTstatA + Node;
 		if (pDCTstat->DCTSysLimit) {
+			TrainDQSRdWrPos_D(pMCTstat, pDCTstat);
 			for (ChipSel = 0; ChipSel < MAX_CS_SUPPORTED; ChipSel += 2) {
-				TrainDQSRdWrPos_D(pMCTstat, pDCTstat, ChipSel);
 				SetEccDQSRdWrPos_D(pMCTstat, pDCTstat, ChipSel);
 			}
 		}
@@ -1137,27 +1219,6 @@ static void mct_SetDQSDelayCSR_D(struct MCTStatStruc *pMCTstat,
 	}
 }
 
-/*
- * mct_SetDQSDelayAllCSR_D:
- * Write the Delay value to all eight byte lanes.
- */
-static void mct_SetDQSDelayAllCSR_D(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat,
-					u8 cs_start)
-{
-	u8 ByteLane;
-	u8 ChipSel = cs_start;
-
-	for (ChipSel = cs_start; ChipSel < (cs_start + 2); ChipSel++) {
-		if ( mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, pDCTstat->Channel, ChipSel)) {
-			for (ByteLane = 0; ByteLane < 8; ByteLane++) {
-				pDCTstat->ByteLane = ByteLane;
-				mct_SetDQSDelayCSR_D(pMCTstat, pDCTstat, ChipSel);
-			}
-		}
-	}
-}
-
 u8 mct_RcvrRankEnabled_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat,
 				u8 Channel, u8 ChipSel)
@@ -1196,7 +1257,7 @@ u32 mct_GetMCTSysAddr_D(struct MCTStatStruc *pMCTstat,
 	reg = 0x40 + (receiver << 2) + reg_off;
 	val = Get_NB32(dev, reg);
 
-	val &= ~0x0F;
+	val &= ~0xe007c01f;
 
 	/* unganged mode DCT0+DCT1, sys addr of DCT1=node
 	 * base+DctSelBaseAddr+local ca base*/
@@ -1277,6 +1338,7 @@ exitGetAddrWNoError:
 	print_debug_dqs("mct_GetMCTSysAddr_D: base_addr ", val, 2);
 	print_debug_dqs("mct_GetMCTSysAddr_D: valid ", *valid, 2);
 	print_debug_dqs("mct_GetMCTSysAddr_D: status ", pDCTstat->Status, 2);
+	print_debug_dqs("mct_GetMCTSysAddr_D: SysBase ", pDCTstat->DCTSysBase, 2);
 	print_debug_dqs("mct_GetMCTSysAddr_D: HoleBase ", pDCTstat->DCTHoleBase, 2);
 	print_debug_dqs("mct_GetMCTSysAddr_D: Cachetop ", pMCTstat->Sub4GCacheTop, 2);
 
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c b/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c
index 528c782..60bc01d 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mcthwl.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -25,7 +26,6 @@ static void EnableZQcalibration(struct MCTStatStruc *pMCTstat, struct DCTStatStr
 static void DisableZQcalibration(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat);
 static void PrepareC_MCT(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat);
 static void PrepareC_DCT(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, u8 dct);
-static void MultiplyDelay(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat, u8 dct);
 static void Restore_OnDimmMirror(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat);
 static void Clear_OnDimmMirror(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat);
 
@@ -154,7 +154,6 @@ static void PhyWLPass2(struct MCTStatStruc *pMCTstat,
 		Clear_OnDimmMirror(pMCTstat, pDCTstat);
 		SetDllSpeedUp_D(pMCTstat, pDCTstat, dct);
 		DisableAutoRefresh_D(pMCTstat, pDCTstat);
-		MultiplyDelay(pMCTstat, pDCTstat, dct);
 		for (dimm = 0; dimm < MAX_DIMMS_SUPPORTED; dimm ++) {
 			if (DIMMValid & (1 << (dimm << 1)))
 				AgesaHwWlPhase1(pDCTstat->C_MCTPtr, pDCTstat->C_DCTPtr[dct], dimm, SecondPass);
@@ -162,6 +161,9 @@ static void PhyWLPass2(struct MCTStatStruc *pMCTstat,
 	}
 }
 
+/* Write Levelization Training
+ * Algorithm detailed in the Fam10h BKDG Rev. 3.62 section 2.8.9.9.1
+ */
 static void WriteLevelization_HW(struct MCTStatStruc *pMCTstat,
 					struct DCTStatStruc *pDCTstat)
 {
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctmtr_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mctmtr_d.c
index 3d625de..596fb23 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctmtr_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctmtr_d.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -201,12 +202,13 @@ static void SetMTRRrange_D(u32 Base, u32 *pLimit, u32 *pMtrrAddr, u16 MtrrType)
 
 void UMAMemTyping_D(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstatA)
 {
-/* UMA memory size may need splitting the MTRR configuration into two
-  Before training use NB_BottomIO or the physical memory size to set the MTRRs.
-  After training, add UMAMemTyping function to reconfigure the MTRRs based on
-  NV_BottomUMA (for UMA systems only).
-  This two-step process allows all memory to be cached for training
-*/
+	/* UMA memory size may need splitting the MTRR configuration into two
+	 * Before training use NB_BottomIO or the physical memory size to set the MTRRs.
+	 * After training, add UMAMemTyping function to reconfigure the MTRRs based on
+	 * NV_BottomUMA (for UMA systems only).
+	 * This two-step process allows all memory to be cached for training
+	*/
+
 	u32 Bottom32bIO, Cache32bTOP;
 	u32 val;
 	u32 addr;
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctndi_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mctndi_d.c
index 013a1b9..6f97061 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctndi_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctndi_d.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -140,7 +141,7 @@ void InterleaveNodes_D(struct MCTStatStruc *pMCTstat,
 	}
 
 	if (DoIntlv) {
-		MCTMemClr_D(pMCTstat,pDCTstatA);
+		MCTMemClr_D(pMCTstat, pDCTstatA);
 		/* Program Interleaving enabled on Node 0 map only.*/
 		MemSize0 <<= bsf(Nodes);	/* MemSize=MemSize*2 (or 4, or 8) */
 		Dct0MemSize <<= bsf(Nodes);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctproc.c b/src/northbridge/amd/amdmct/mct_ddr3/mctproc.c
index da2f372..cda9c6b 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctproc.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctproc.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -36,10 +37,10 @@ u32 mct_SetDramConfigMisc2(struct DCTStatStruc *pDCTstat, u8 dct, u32 misc2)
 		val = Get_NB32(pDCTstat->dev_dct, dct * 0x100 + 0x78);
 
 		val &= 7;
-		val = ((~val) & 0xFF) + 1;
+		val = ((~val) & 0xff) + 1;
 		val += 6;
-		val &= 0xFF;
-		misc2 &= 0xFFF8FFFF;
+		val &= 0x7;
+		misc2 &= 0xfff8ffff;
 		misc2 |= val << 16;	/* DataTxFifoWrDly */
 		if (pDCTstat->LogicalCPUID & AMD_DR_Dx)
 			misc2 |= 1 << 7; /* ProgOdtEn */
@@ -52,11 +53,15 @@ void mct_ExtMCTConfig_Cx(struct DCTStatStruc *pDCTstat)
 	u32 val;
 
 	if (pDCTstat->LogicalCPUID & (AMD_DR_Cx)) {
-		Set_NB32(pDCTstat->dev_dct, 0x11C, 0x0CE00FC0 | 1 << 29/* FlushWrOnStpGnt */);
+		/* Revision C */
+		Set_NB32(pDCTstat->dev_dct, 0x11c, 0x0ce00fc0 | 1 << 29/* FlushWrOnStpGnt */);
+	}
 
-		val = Get_NB32(pDCTstat->dev_dct, 0x1B0);
-		val &= 0xFFFFF8C0;
+	if (pDCTstat->LogicalCPUID & (AMD_DR_Cx)) {
+		val = Get_NB32(pDCTstat->dev_dct, 0x1b0);
+		val &= ~0x73f;
 		val |= 0x101;	/* BKDG recommended settings */
-		Set_NB32(pDCTstat->dev_dct, 0x1B0, val);
+
+		Set_NB32(pDCTstat->dev_dct, 0x1b0, val);
 	}
 }
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c b/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c
index 6de2f4e..b21b96a 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctsdi.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -172,6 +173,7 @@ static u32 mct_MR1(struct MCTStatStruc *pMCTstat,
 			ret |= 1 << 11;
 	}
 
+	/* program MrsAddress[12]=QOFF: based on F2x[1,0]84[Qoff] */
 	if (dword & (1 << 13))
 		ret |= 1 << 12;
 
@@ -199,7 +201,8 @@ static u32 mct_MR0(struct MCTStatStruc *pMCTstat,
 	/* program MrsAddress[6:4,2]=read CAS latency
 	   (CL):based on F2x[1,0]88[Tcl] */
 	dword2 = Get_NB32(dev, reg_off + 0x88);
-	ret |= (dword2 & 0xF) << 4; /* F2x88[3:0] to MrsAddress[6:4,2]=xxx0b */
+	ret |= (dword2 & 0x7) << 4;		/* F2x88[2:0] to MrsAddress[6:4] */
+	ret |= ((dword2 & 0x8) >> 3) << 2;	/* F2x88[3] to MrsAddress[2] */
 
 	/* program MrsAddress[12]=0 (PPD):slow exit */
 	if (dword & (1 << 23))
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
index 8e5c268..91e8f77 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -24,25 +25,13 @@
 
 static void dqsTrainRcvrEn_SW(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u8 Pass);
-static u8 mct_SavePassRcvEnDly_D(struct DCTStatStruc *pDCTstat,
-					u8 rcvrEnDly, u8 Channel,
-					u8 receiver, u8 Pass);
-static u8 mct_CompareTestPatternQW0_D(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat,
-					u32 addr, u8 channel,
-					u8 pattern, u8 Pass);
 static void mct_InitDQSPos4RcvrEn_D(struct MCTStatStruc *pMCTstat,
 					 struct DCTStatStruc *pDCTstat);
 static void InitDQSPos4RcvrEn_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u8 Channel);
 static void CalcEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u8 Channel);
-static void mct_SetFinalRcvrEnDly_D(struct DCTStatStruc *pDCTstat,
-				u8 RcvrEnDly, u8 where,
-				u8 Channel, u8 Receiver,
-				u32 dev, u32 index_reg,
-				u8 Addl_Index, u8 Pass);
-static void mct_SetMaxLatency_D(struct DCTStatStruc *pDCTstat, u8 Channel, u8 DQSRcvEnDly);
+static void mct_SetMaxLatency_D(struct DCTStatStruc *pDCTstat, u8 Channel, u16 DQSRcvEnDly);
 static void fenceDynTraining_D(struct MCTStatStruc *pMCTstat,
 			struct DCTStatStruc *pDCTstat, u8 dct);
 static void mct_DisableDQSRcvEn_D(struct DCTStatStruc *pDCTstat);
@@ -50,17 +39,17 @@ static void mct_DisableDQSRcvEn_D(struct DCTStatStruc *pDCTstat);
 /* Warning:  These must be located so they do not cross a logical 16-bit
    segment boundary! */
 static const u32 TestPattern0_D[] = {
-	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
-	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
-	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
-	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
-};
-static const u32 TestPattern1_D[] = {
 	0x55555555, 0x55555555, 0x55555555, 0x55555555,
 	0x55555555, 0x55555555, 0x55555555, 0x55555555,
 	0x55555555, 0x55555555, 0x55555555, 0x55555555,
 	0x55555555, 0x55555555, 0x55555555, 0x55555555,
 };
+static const u32 TestPattern1_D[] = {
+	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
+	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
+	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
+	0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
+};
 static const u32 TestPattern2_D[] = {
 	0x12345678, 0x87654321, 0x23456789, 0x98765432,
 	0x59385824, 0x30496724, 0x24490795, 0x99938733,
@@ -104,16 +93,87 @@ void mct_TrainRcvrEn_D(struct MCTStatStruc *pMCTstat,
 		dqsTrainRcvrEn_SW(pMCTstat, pDCTstat, Pass);
 }
 
+/* Fetch the per-lane delay values of one DIMM from the write DQS registers (indexes 0x30.. + dimm * 3) into current_total_delay[] */
+static void read_dqs_write_timing_control_registers(uint16_t* current_total_delay, uint32_t dev, uint8_t dimm, uint32_t index_reg)
+{
+	uint8_t lane;
+	uint32_t dword, wdt_reg;
+
+	for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+		/* Two lanes share each register; lane 8 has a register of its own */
+		if (lane < 2)
+			wdt_reg = 0x30;
+		else if (lane < 4)
+			wdt_reg = 0x31;
+		else if (lane < 6)
+			wdt_reg = 0x40;
+		else if (lane < 8)
+			wdt_reg = 0x41;
+		else if (lane == 8)
+			wdt_reg = 0x32;
+		else
+			continue;	/* no register is known for this lane; never use wdt_reg uninitialized */
+		dword = Get_NB32_index_wait(dev, index_reg, wdt_reg + dimm * 3);
+		/* Odd lanes are held in bits 23:16 of the register, even lanes in bits 7:0 */
+		current_total_delay[lane] = (lane & 1) ? ((dword >> 16) & 0xff) : (dword & 0xff);
+	}
+}
+
+/* Write the DQS receiver enable delays of byte lanes 0-7 for one DIMM.
+ * Each register holds two lanes, so every lane is updated with its own
+ * read-modify-write cycle that leaves the neighbouring lane's field intact.
+ * Registers belonging to consecutive DIMMs are spaced 3 indexes apart.
+ */
+static void write_dqs_receiver_enable_control_registers(uint16_t* current_total_delay, uint32_t dev, uint8_t dimm, uint32_t index_reg)
+{
+	uint8_t lane;
+
+	for (lane = 0; lane < 8; lane++) {
+		uint32_t ret_reg;
+		uint32_t dword;
+		uint8_t field_shift;
+
+		/* Lanes 0-3 use registers 0x10/0x11 and lanes 4-7 use 0x20/0x21 */
+		ret_reg = (lane < 4) ? 0x10 : 0x20;
+		ret_reg += (lane >> 1) & 0x1;
+		ret_reg += dimm * 3;
+
+		/* Odd lanes sit in bits 24:16, even lanes in bits 8:0 */
+		field_shift = (lane & 0x1) ? 16 : 0;
+
+		dword = Get_NB32_index_wait(dev, index_reg, ret_reg);
+		dword &= ~(0x1ff << field_shift);
+		dword |= (current_total_delay[lane] & 0x1ff) << field_shift;
+		Set_NB32_index_wait(dev, index_reg, ret_reg, dword);
+	}
+}
+
+/* Pass testaddr to SetUpperFSbase(), then return (testaddr << 8), plus 8 when channel is non-zero and SB_128bitmode is set */
+static uint32_t convert_testaddr_and_channel_to_address(struct DCTStatStruc *pDCTstat, uint32_t testaddr, uint8_t channel)
+{
+	uint32_t address;
+
+	SetUpperFSbase(testaddr);
+	address = testaddr << 8;
+	if (channel && (pDCTstat->Status & (1 << SB_128bitmode)))
+		address += 8;	/* second channel */
+	return address;
+}
+
+/* DQS Receiver Enable Training
+ * Algorithm detailed in the Fam10h BKDG Rev. 3.62 section 2.8.9.9.2
+ */
 static void dqsTrainRcvrEn_SW(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat, u8 Pass)
 {
-	u8 Channel, RcvrEnDly, RcvrEnDlyRmin;
-	u8 Test0, Test1, CurrTest, CurrTestSide0, CurrTestSide1;
-	u8 CTLRMaxDelay, _2Ranks, PatternA, PatternB;
+	u8 Channel;
+	u8 _2Ranks;
 	u8 Addl_Index = 0;
 	u8 Receiver;
 	u8 _DisableDramECC = 0, _Wrap32Dis = 0, _SSE2 = 0;
-	u8 RcvrEnDlyLimit, Final_Value, MaxDelay_CH[2];
+	u8 Final_Value;
+	u16 CTLRMaxDelay;
+	u16 MaxDelay_CH[2];
 	u32 TestAddr0, TestAddr1, TestAddr0B, TestAddr1B;
 	u32 PatternBuffer[64+4]; /* FIXME: need increase 8? */
 	u32 Errors;
@@ -127,9 +187,20 @@ static void dqsTrainRcvrEn_SW(struct MCTStatStruc *pMCTstat,
 	u32 cr4;
 	u32 lo, hi;
 
+	uint32_t dword;
+	uint8_t rank;
+	uint8_t lane;
+	uint16_t current_total_delay[MAX_BYTE_LANES];
+	uint16_t candidate_total_delay[8];
+	uint8_t data_test_pass_sr[2][8];	/* [rank][lane] */
+	uint8_t data_test_pass[8];		/* [lane] */
+	uint8_t data_test_pass_prev[8];		/* [lane] */
+	uint8_t window_det_toggle[8];
+	uint8_t trained[8];
+	uint64_t result_qword1;
+	uint64_t result_qword2;
+
 	u8 valid;
-	u32 tmp;
-	u8 LastTest;
 
 	print_debug_dqs("\nTrainRcvEn: Node", pDCTstat->Node_ID, 0);
 	print_debug_dqs("TrainRcvEn: Pass", Pass, 0);
@@ -181,33 +252,103 @@ static void dqsTrainRcvrEn_SW(struct MCTStatStruc *pMCTstat,
 
 	Errors = 0;
 	dev = pDCTstat->dev_dct;
-	CTLRMaxDelay = 0;
 
 	for (Channel = 0; Channel < 2; Channel++) {
 		print_debug_dqs("\tTrainRcvEn51: Node ", pDCTstat->Node_ID, 1);
 		print_debug_dqs("\tTrainRcvEn51: Channel ", Channel, 1);
 		pDCTstat->Channel = Channel;
 
+		CTLRMaxDelay = 0;
 		MaxDelay_CH[Channel] = 0;
 		index_reg = 0x98 + 0x100 * Channel;
 
 		Receiver = mct_InitReceiver_D(pDCTstat, Channel);
-		/* There are four receiver pairs, loosely associated with chipselects. */
+		/* There are four receiver pairs, loosely associated with chipselects.
+		 * This is essentially looping over each DIMM.
+		 */
 		for (; Receiver < 8; Receiver += 2) {
 			Addl_Index = (Receiver >> 1) * 3 + 0x10;
-			LastTest = DQS_FAIL;
-
-			/* mct_ModifyIndex_D */
-			RcvrEnDlyRmin = RcvrEnDlyLimit = 0xff;
 
 			print_debug_dqs("\t\tTrainRcvEnd52: index ", Addl_Index, 2);
 
-			if(!mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, Channel, Receiver)) {
+			if (!mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, Channel, Receiver)) {
 				continue;
 			}
 
+			/* Clear data structures */
+			for (lane = 0; lane < 8; lane++) {
+				data_test_pass_prev[lane] = 0;
+				trained[lane] = 0;
+			}
+
+			/* 2.8.9.9.2 (1, 6)
+			 * Retrieve gross and fine timing fields from write DQS registers
+			 */
+			read_dqs_write_timing_control_registers(current_total_delay, dev, (Receiver >> 1), index_reg);
+
+			/* 2.8.9.9.2 (1)
+			 * Program the Write Data Timing and Write ECC Timing register to
+			 * the values stored in the DQS Write Timing Control register
+			 * for each lane
+			 */
+			for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+				uint32_t wdt_reg;
+
+				/* Calculate Write Data Timing register location */
+				if ((lane == 0) || (lane == 1) || (lane == 2) || (lane == 3))
+					wdt_reg = 0x1;
+				if ((lane == 4) || (lane == 5) || (lane == 6) || (lane == 7))
+					wdt_reg = 0x2;
+				if (lane == 8)
+					wdt_reg = 0x3;
+				wdt_reg |= ((Receiver / 2) << 8);
+
+				/* Set Write Data Timing register values */
+				dword = Get_NB32_index_wait(dev, index_reg, wdt_reg);
+				if ((lane == 7) || (lane == 3)) {
+					dword &= ~(0x7f << 24);
+					dword |= (current_total_delay[lane] & 0x7f) << 24;
+				}
+				if ((lane == 6) || (lane == 2)) {
+					dword &= ~(0x7f << 16);
+					dword |= (current_total_delay[lane] & 0x7f) << 16;
+				}
+				if ((lane == 5) || (lane == 1)) {
+					dword &= ~(0x7f << 8);
+					dword |= (current_total_delay[lane] & 0x7f) << 8;
+				}
+				if ((lane == 8) || (lane == 4) || (lane == 0)) {
+					dword &= ~0x7f;
+					dword |= current_total_delay[lane] & 0x7f;
+				}
+				Set_NB32_index_wait(dev, index_reg, wdt_reg, dword);
+			}
+
+			/* 2.8.9.9.2 (2)
+			 * Program the Read DQS Timing Control and the Read DQS ECC Timing Control registers
+			 * to 1/2 MEMCLK for all lanes
+			 */
+			for (lane = 0; lane < MAX_BYTE_LANES; lane++) {
+				uint32_t rdt_reg;
+				if ((lane == 0) || (lane == 1) || (lane == 2) || (lane == 3))
+					rdt_reg = 0x5;
+				if ((lane == 4) || (lane == 5) || (lane == 6) || (lane == 7))
+					rdt_reg = 0x6;
+				if (lane == 8)
+					rdt_reg = 0x7;
+				rdt_reg |= ((Receiver / 2) << 8);
+				if (lane == 8)
+					dword = 0x0000003f;
+				else
+					dword = 0x3f3f3f3f;
+				Set_NB32_index_wait(dev, index_reg, rdt_reg, dword);
+			}
+
+			/* 2.8.9.9.2 (3)
+			 * Select two test addresses for each rank present
+			 */
 			TestAddr0 = mct_GetRcvrSysAddr_D(pMCTstat, pDCTstat, Channel, Receiver, &valid);
-			if(!valid) {	/* Address not supported on current CS */
+			if (!valid) {	/* Address not supported on current CS */
 				continue;
 			}
 
@@ -229,171 +370,214 @@ static void dqsTrainRcvrEn_SW(struct MCTStatStruc *pMCTstat,
 			print_debug_dqs("\t\tTrainRcvEn53: TestAddr1 ", TestAddr1, 2);
 			print_debug_dqs("\t\tTrainRcvEn53: TestAddr1B ", TestAddr1B, 2);
 
-			/*
-			 * Get starting RcvrEnDly value
+			/* 2.8.9.9.2 (4, 5)
+			 * Write 1 cache line of the appropriate test pattern to each test address
 			 */
-			RcvrEnDly = mct_Get_Start_RcvrEnDly_1Pass(Pass);
+			mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0, 0); /* rank 0 of DIMM, testpattern 0 */
+			mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0B, 1); /* rank 0 of DIMM, testpattern 1 */
+			if (_2Ranks) {
+				mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1, 0); /*rank 1 of DIMM, testpattern 0 */
+				mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1B, 1); /*rank 1 of DIMM, testpattern 1 */
+			}
 
-			/* mct_GetInitFlag_D*/
-			if (Pass == FirstPass) {
-				pDCTstat->DqsRcvEn_Pass = 0;
-			} else {
-				pDCTstat->DqsRcvEn_Pass=0xFF;
+#if DQS_TRAIN_DEBUG > 0
+			for (lane = 0; lane < 8; lane++) {
+				print_debug_dqs("\t\tTrainRcvEn54: lane: ", lane, 2);
+				print_debug_dqs("\t\tTrainRcvEn54: current_total_delay ", current_total_delay[lane], 2);
 			}
-			pDCTstat->DqsRcvEn_Saved = 0;
+#endif
 
+			/* 2.8.9.9.2 (6)
+			 * Write gross and fine timing fields to the DQS receiver enable registers
+			 */
+			write_dqs_receiver_enable_control_registers(current_total_delay, dev, (Receiver >> 1), index_reg);
+
+			/* 2.8.9.9.2 (7)
+			 * Loop over all delay values up to 1 MEMCLK (0x40 delay steps) from the initial delay values
+			 *
+			 * FIXME
+			 * It is not clear if training should be discontinued if any test failures occur in the first
+			 * 1 MEMCLK window, or if it should be discontinued if no successes occur in the first 1 MEMCLK
+			 * window.  Therefore, loop over up to 2 MEMCLK (0x80 delay steps) to be on the safe side.
+			 */
+			uint16_t current_delay_step;
 
-			while(RcvrEnDly < RcvrEnDlyLimit) {	/* sweep Delay value here */
-				print_debug_dqs("\t\t\tTrainRcvEn541: RcvrEnDly ", RcvrEnDly, 3);
+			for (current_delay_step = 0; current_delay_step < 0x80; current_delay_step++) {
+				print_debug_dqs("\t\t\tTrainRcvEn541: current_delay_step ", current_delay_step, 3);
 
-				/* callback not required
-				if(mct_AdjustDelay_D(pDCTstat, RcvrEnDly))
-					goto skipDly;
+				/* 2.8.9.9.2 (7 D)
+				* Terminate if all lanes are trained
 				*/
+				uint8_t all_lanes_trained = 1;
+				for (lane = 0; lane < 8; lane++)
+					if (!trained[lane])
+						all_lanes_trained = 0;
 
-				/* Odd steps get another pattern such that even
-				 and odd steps alternate. The pointers to the
-				 patterns will be swaped at the end of the loop
-				 so that they correspond. */
-				if(RcvrEnDly & 1) {
-					PatternA = 1;
-					PatternB = 0;
-				} else {
-					/* Even step */
-					PatternA = 0;
-					PatternB = 1;
-				}
-
-				mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0, PatternA); /* rank 0 of DIMM, testpattern 0 */
-				mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0B, PatternB); /* rank 0 of DIMM, testpattern 1 */
-				if(_2Ranks) {
-					mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1, PatternA); /*rank 1 of DIMM, testpattern 0 */
-					mct_Write1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1B, PatternB); /*rank 1 of DIMM, testpattern 1 */
-				}
-
-				mct_SetRcvrEnDly_D(pDCTstat, RcvrEnDly, 0, Channel, Receiver, dev, index_reg, Addl_Index, Pass);
-
-				CurrTest = DQS_FAIL;
-				CurrTestSide0 = DQS_FAIL;
-				CurrTestSide1 = DQS_FAIL;
-
-				mct_Read1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0);	/*cache fills */
-				Test0 = mct_CompareTestPatternQW0_D(pMCTstat, pDCTstat, TestAddr0, Channel, PatternA, Pass);/* ROM vs cache compare */
-				proc_IOCLFLUSH_D(TestAddr0);
-				ResetDCTWrPtr_D(dev, index_reg, Addl_Index);
-
-				print_debug_dqs("\t\t\tTrainRcvEn542: Test0 result ", Test0, 3);
-
-				/* != 0x00 mean pass */
-
-				if(Test0 == DQS_PASS) {
-					mct_Read1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0B);	/*cache fills */
-					/* ROM vs cache compare */
-					Test1 = mct_CompareTestPatternQW0_D(pMCTstat, pDCTstat, TestAddr0B, Channel, PatternB, Pass);
-					proc_IOCLFLUSH_D(TestAddr0B);
-					ResetDCTWrPtr_D(dev, index_reg, Addl_Index);
-
-					print_debug_dqs("\t\t\tTrainRcvEn543: Test1 result ", Test1, 3);
+				if (all_lanes_trained)
+					break;
 
-					if(Test1 == DQS_PASS) {
-						CurrTestSide0 = DQS_PASS;
+				/* 2.8.9.9.2 (7 A)
+				* Loop over all ranks
+				*/
+				for (rank = 0; rank < (_2Ranks + 1); rank++) {
+					/* 2.8.9.9.2 (7 A a-d)
+					 * Read the first test address of the current rank
+					 * Store the first data beat for analysis
+					 * Reset read pointer in the DRAM controller FIFO
+					 * Read the second test address of the current rank
+					 * Store the first data beat for analysis
+					 * Reset read pointer in the DRAM controller FIFO
+					 */
+					if (rank & 1) {
+						/* 2.8.9.9.2 (7 D)
+						 * Invert read instructions to alternate data read order on the bus
+						 */
+						proc_IOCLFLUSH_D((rank == 0)?TestAddr0B:TestAddr1B);
+						result_qword2 = read64_fs(convert_testaddr_and_channel_to_address(pDCTstat, (rank == 0)?TestAddr0B:TestAddr1B, Channel));
+						write_dqs_receiver_enable_control_registers(current_total_delay, dev, (Receiver >> 1), index_reg);
+						proc_IOCLFLUSH_D((rank == 0)?TestAddr0:TestAddr1);
+						result_qword1 = read64_fs(convert_testaddr_and_channel_to_address(pDCTstat, (rank == 0)?TestAddr0:TestAddr1, Channel));
+						write_dqs_receiver_enable_control_registers(current_total_delay, dev, (Receiver >> 1), index_reg);
+					} else {
+						proc_IOCLFLUSH_D((rank == 0)?TestAddr0:TestAddr1);
+						result_qword1 = read64_fs(convert_testaddr_and_channel_to_address(pDCTstat, (rank == 0)?TestAddr0:TestAddr1, Channel));
+						write_dqs_receiver_enable_control_registers(current_total_delay, dev, (Receiver >> 1), index_reg);
+						proc_IOCLFLUSH_D((rank == 0)?TestAddr0B:TestAddr1B);
+						result_qword2 = read64_fs(convert_testaddr_and_channel_to_address(pDCTstat, (rank == 0)?TestAddr0B:TestAddr1B, Channel));
+						write_dqs_receiver_enable_control_registers(current_total_delay, dev, (Receiver >> 1), index_reg);
 					}
-				}
-				if(_2Ranks) {
-					mct_Read1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1);	/*cache fills */
-					/* ROM vs cache compare */
-					Test0 = mct_CompareTestPatternQW0_D(pMCTstat, pDCTstat, TestAddr1, Channel, PatternA, Pass);
-					proc_IOCLFLUSH_D(TestAddr1);
-					ResetDCTWrPtr_D(dev, index_reg, Addl_Index);
-
-					print_debug_dqs("\t\t\tTrainRcvEn544: Test0 result ", Test0, 3);
-
-					if(Test0 == DQS_PASS) {
-						mct_Read1LTestPattern_D(pMCTstat, pDCTstat, TestAddr1B);	/*cache fills */
-						/* ROM vs cache compare */
-						Test1 = mct_CompareTestPatternQW0_D(pMCTstat, pDCTstat, TestAddr1B, Channel, PatternB, Pass);
-						proc_IOCLFLUSH_D(TestAddr1B);
-						ResetDCTWrPtr_D(dev, index_reg, Addl_Index);
-
-						print_debug_dqs("\t\t\tTrainRcvEn545: Test1 result ", Test1, 3);
-						if(Test1 == DQS_PASS) {
-							CurrTestSide1 = DQS_PASS;
+					/* 2.8.9.9.2 (7 A e)
+					 * Compare both read patterns and flag passing ranks/lanes
+					 */
+					uint8_t result_lane_byte1;
+					uint8_t result_lane_byte2;
+					for (lane = 0; lane < 8; lane++) {
+						if (trained[lane] == 1) {
+#if DQS_TRAIN_DEBUG > 0
+							print_debug_dqs("\t\t\t\t\t\t\t\t lane already trained: ", lane, 4);
+#endif
+							continue;
 						}
+
+						result_lane_byte1 = (result_qword1 >> (lane * 8)) & 0xff;
+						result_lane_byte2 = (result_qword2 >> (lane * 8)) & 0xff;
+						if ((result_lane_byte1 == 0x55) && (result_lane_byte2 == 0xaa))
+							data_test_pass_sr[rank][lane] = 1;
+						else
+							data_test_pass_sr[rank][lane] = 0;
+#if DQS_TRAIN_DEBUG > 0
+						print_debug_dqs_pair("\t\t\t\t\t\t\t\t ", 0x55, "  |  ", result_lane_byte1, 4);
+						print_debug_dqs_pair("\t\t\t\t\t\t\t\t ", 0xaa, "  |  ", result_lane_byte2, 4);
+#endif
 					}
 				}
 
-				if(_2Ranks) {
-					if ((CurrTestSide0 == DQS_PASS) && (CurrTestSide1 == DQS_PASS)) {
-						CurrTest = DQS_PASS;
+				/* 2.8.9.9.2 (7 B)
+				 * If DIMM is dual rank, only use delays that pass testing for both ranks
+				 */
+				for (lane = 0; lane < 8; lane++) {
+					if (_2Ranks) {
+						if ((data_test_pass_sr[0][lane]) && (data_test_pass_sr[1][lane]))
+							data_test_pass[lane] = 1;
+						else
+							data_test_pass[lane] = 0;
+					} else {
+						data_test_pass[lane] = data_test_pass_sr[0][lane];
 					}
-				} else if (CurrTestSide0 == DQS_PASS) {
-					CurrTest = DQS_PASS;
 				}
 
-				/* record first pass DqsRcvEn to stack */
-				valid = mct_SavePassRcvEnDly_D(pDCTstat, RcvrEnDly, Channel, Receiver, Pass);
+				/* 2.8.9.9.2 (7 E)
+				 * For each lane, update the DQS receiver delay setting in support of next iteration
+				 */
+				for (lane = 0; lane < 8; lane++) {
+					if (trained[lane] == 1)
+						continue;
+
+					/* 2.8.9.9.2 (7 C a)
+					 * Save the total delay of the first success after a failure for later use
+					 */
+					if ((data_test_pass[lane] == 1) && (data_test_pass_prev[lane] == 0)) {
+						candidate_total_delay[lane] = current_total_delay[lane];
+						window_det_toggle[lane] = 0;
+					}
 
-				/* Break(1:RevF,2:DR) or not(0) FIXME: This comment deosn't make sense */
-				if(valid == 2 || (LastTest == DQS_FAIL && valid == 1)) {
-					RcvrEnDlyRmin = RcvrEnDly;
-					break;
+					/* 2.8.9.9.2 (7 C b)
+					 * If the current delay failed testing add 1/8 UI to the current delay
+					 */
+					if (data_test_pass[lane] == 0)
+						current_total_delay[lane] += 0x4;
+
+					/* 2.8.9.9.2 (7 C c)
+					 * If the current delay passed testing alternately add either 1/32 UI or 1/4 UI to the current delay
+					 * If 1.25 UI of delay have been added with no failures the lane is considered trained
+					 */
+					if (data_test_pass[lane] == 1) {
+						/* See if lane is trained */
+						if ((current_total_delay[lane] - candidate_total_delay[lane]) >= 0x28) {
+							trained[lane] = 1;
+
+							/* Calculate and set final lane delay value
+							 * The final delay is the candidate delay + 7/8 UI
+							 */
+							current_total_delay[lane] = candidate_total_delay[lane] + 0x1c;
+						} else {
+							if (window_det_toggle[lane] == 0) {
+								current_total_delay[lane] += 0x1;
+								window_det_toggle[lane] = 1;
+							} else {
+								current_total_delay[lane] += 0x8;
+								window_det_toggle[lane] = 0;
+							}
+						}
+					}
 				}
 
-				LastTest = CurrTest;
-
-				/* swap the rank 0 pointers */
-				tmp = TestAddr0;
-				TestAddr0 = TestAddr0B;
-				TestAddr0B = tmp;
-
-				/* swap the rank 1 pointers */
-				tmp = TestAddr1;
-				TestAddr1 = TestAddr1B;
-				TestAddr1B = tmp;
-
-				print_debug_dqs("\t\t\tTrainRcvEn56: RcvrEnDly ", RcvrEnDly, 3);
+				/* Update delays in hardware */
+				write_dqs_receiver_enable_control_registers(current_total_delay, dev, (Receiver >> 1), index_reg);
 
-				RcvrEnDly++;
-
-			}	/* while RcvrEnDly */
-
-			print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDly ", RcvrEnDly, 2);
-			print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDlyRmin ", RcvrEnDlyRmin, 3);
-			print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDlyLimit ", RcvrEnDlyLimit, 3);
-			if(RcvrEnDlyRmin == RcvrEnDlyLimit) {
-				/* no passing window */
-				pDCTstat->ErrStatus |= 1 << SB_NORCVREN;
-				Errors |= 1 << SB_NORCVREN;
-				pDCTstat->ErrCode = SC_FatalErr;
+				/* Save previous results for comparison in the next iteration */
+				for (lane = 0; lane < 8; lane++)
+					data_test_pass_prev[lane] = data_test_pass[lane];
 			}
 
-			if(RcvrEnDly > (RcvrEnDlyLimit - 1)) {
-				/* passing window too narrow, too far delayed*/
-				pDCTstat->ErrStatus |= 1 << SB_SmallRCVR;
-				Errors |= 1 << SB_SmallRCVR;
-				pDCTstat->ErrCode = SC_FatalErr;
-				RcvrEnDly = RcvrEnDlyLimit - 1;
-				pDCTstat->CSTrainFail |= 1 << Receiver;
-				pDCTstat->DimmTrainFail |= 1 << (Receiver + Channel);
-			}
-
-			/* CHB_D0_B0_RCVRDLY set in mct_Average_RcvrEnDly_Pass */
-			mct_Average_RcvrEnDly_Pass(pDCTstat, RcvrEnDly, RcvrEnDlyLimit, Channel, Receiver, Pass);
+#if DQS_TRAIN_DEBUG > 0
+			for (lane = 0; lane < 8; lane++)
+				print_debug_dqs_pair("\t\tTrainRcvEn55: Lane ", lane, " current_total_delay ", current_total_delay[lane], 2);
+#endif
 
-			mct_SetFinalRcvrEnDly_D(pDCTstat, RcvrEnDly, Final_Value, Channel, Receiver, dev, index_reg, Addl_Index, Pass);
+			/* Find highest delay value and save for later use */
+			for (lane = 0; lane < 8; lane++)
+				if (current_total_delay[lane] > CTLRMaxDelay)
+					CTLRMaxDelay = current_total_delay[lane];
 
-			if(pDCTstat->ErrStatus & (1 << SB_SmallRCVR)) {
-				Errors |= 1 << SB_SmallRCVR;
+			/* See if any lanes failed training, and set error flags appropriately
+			 * For all trained lanes, save delay values for later use
+			 */
+			for (lane = 0; lane < 8; lane++) {
+				if (trained[lane]) {
+                        		pDCTstat->CH_D_B_RCVRDLY[Channel][Receiver >> 1][lane] = current_total_delay[lane];
+				} else {
+					printk(BIOS_WARNING, "TrainRcvrEn: WARNING: Lane %d of receiver %d on channel %d failed training!\n", lane, Receiver, Channel);
+
+					/* Set error flags */
+					pDCTstat->ErrStatus |= 1 << SB_NORCVREN;
+					Errors |= 1 << SB_NORCVREN;
+					pDCTstat->ErrCode = SC_FatalErr;
+					pDCTstat->CSTrainFail |= 1 << Receiver;
+					pDCTstat->DimmTrainFail |= 1 << (Receiver + Channel);
+				}
 			}
 
-			RcvrEnDly += Pass1MemClkDly;
-			if(RcvrEnDly > CTLRMaxDelay) {
-				CTLRMaxDelay = RcvrEnDly;
-			}
+			/* 2.8.9.9.2 (8)
+			 * Flush the receiver FIFO
+			 * Write one full cache line of non-0x55/0xaa data to one of the test addresses, then read it back to flush the FIFO
+			 */
 
-		}	/* while Receiver */
+			WriteLNTestPattern(TestAddr0 << 8, (uint8_t *)TestPattern2_D, 1);
+			mct_Read1LTestPattern_D(pMCTstat, pDCTstat, TestAddr0);
+		}
 		MaxDelay_CH[Channel] = CTLRMaxDelay;
-	}	/* for Channel */
+	}
 
 	CTLRMaxDelay = MaxDelay_CH[0];
 	if (MaxDelay_CH[1] > CTLRMaxDelay)
@@ -428,31 +612,31 @@ static void dqsTrainRcvrEn_SW(struct MCTStatStruc *pMCTstat,
 
 #if DQS_TRAIN_DEBUG > 0
 	{
-		u8 Channel;
+		u8 ChannelDTD;
 		printk(BIOS_DEBUG, "TrainRcvrEn: CH_MaxRdLat:\n");
-		for(Channel = 0; Channel<2; Channel++) {
+		for(ChannelDTD = 0; ChannelDTD<2; ChannelDTD++) {
 			printk(BIOS_DEBUG, "Channel:%x: %x\n",
-			       Channel, pDCTstat->CH_MaxRdLat[Channel]);
+			       ChannelDTD, pDCTstat->CH_MaxRdLat[ChannelDTD]);
 		}
 	}
 #endif
 
 #if DQS_TRAIN_DEBUG > 0
 	{
-		u8 val;
-		u8 Channel, Receiver;
+		u16 valDTD;
+		u8 ChannelDTD, ReceiverDTD;
 		u8 i;
-		u8 *p;
+		u16 *p;
 
 		printk(BIOS_DEBUG, "TrainRcvrEn: CH_D_B_RCVRDLY:\n");
-		for(Channel = 0; Channel < 2; Channel++) {
-			printk(BIOS_DEBUG, "Channel:%x\n", Channel);
-			for(Receiver = 0; Receiver<8; Receiver+=2) {
-				printk(BIOS_DEBUG, "\t\tReceiver:%x:", Receiver);
-				p = pDCTstat->CH_D_B_RCVRDLY[Channel][Receiver>>1];
+		for(ChannelDTD = 0; ChannelDTD < 2; ChannelDTD++) {
+			printk(BIOS_DEBUG, "Channel:%x\n", ChannelDTD);
+			for(ReceiverDTD = 0; ReceiverDTD<8; ReceiverDTD+=2) {
+				printk(BIOS_DEBUG, "\t\tReceiver:%x:", ReceiverDTD);
+				p = pDCTstat->CH_D_B_RCVRDLY[ChannelDTD][ReceiverDTD>>1];
 				for (i=0;i<8; i++) {
-					val  = p[i];
-					printk(BIOS_DEBUG, "%x ", val);
+					valDTD = p[i];
+					printk(BIOS_DEBUG, " %03x", valDTD);
 				}
 				printk(BIOS_DEBUG, "\n");
 			}
@@ -475,15 +659,6 @@ u8 mct_InitReceiver_D(struct DCTStatStruc *pDCTstat, u8 dct)
 	}
 }
 
-static void mct_SetFinalRcvrEnDly_D(struct DCTStatStruc *pDCTstat, u8 RcvrEnDly, u8 where, u8 Channel, u8 Receiver, u32 dev, u32 index_reg, u8 Addl_Index, u8 Pass/*, u8 *p*/)
-{
-	/*
-	 * Program final DqsRcvEnDly to additional index for DQS receiver
-	 *  enabled delay
-	 */
-	mct_SetRcvrEnDly_D(pDCTstat, RcvrEnDly, where, Channel, Receiver, dev, index_reg, Addl_Index, Pass);
-}
-
 static void mct_DisableDQSRcvEn_D(struct DCTStatStruc *pDCTstat)
 {
 	u8 ch_end, ch;
@@ -514,17 +689,20 @@ static void mct_DisableDQSRcvEn_D(struct DCTStatStruc *pDCTstat)
  * Function only used once so it was inlined.
  */
 
-void mct_SetRcvrEnDly_D(struct DCTStatStruc *pDCTstat, u8 RcvrEnDly,
+/* Set F2x[1, 0]9C_x[2B:10] DRAM DQS Receiver Enable Timing Control Registers
+ * See BKDG Rev. 3.62 page 268 for more information
+ */
+void mct_SetRcvrEnDly_D(struct DCTStatStruc *pDCTstat, u16 RcvrEnDly,
 			u8 FinalValue, u8 Channel, u8 Receiver, u32 dev,
 			u32 index_reg, u8 Addl_Index, u8 Pass)
 {
 	u32 index;
 	u8 i;
-	u8 *p;
+	u16 *p;
 	u32 val;
 
-	if(RcvrEnDly == 0xFE) {
-		/*set the boudary flag */
+	if(RcvrEnDly == 0x1fe) {
+		/*set the boundary flag */
 		pDCTstat->Status |= 1 << SB_DQSRcvLimit;
 	}
 
@@ -543,27 +721,57 @@ void mct_SetRcvrEnDly_D(struct DCTStatStruc *pDCTstat, u8 RcvrEnDly,
 		val = Get_NB32_index_wait(dev, index_reg, index);
 		if(i & 1) {
 			/* odd byte lane */
-			val &= ~(0xFF << 16);
-			val |= (RcvrEnDly << 16);
+			val &= ~(0x1ff << 16);
+			val |= ((RcvrEnDly & 0x1ff) << 16);
 		} else {
 			/* even byte lane */
-			val &= ~0xFF;
-			val |= RcvrEnDly;
+			val &= ~0x1ff;
+			val |= (RcvrEnDly & 0x1ff);
 		}
 		Set_NB32_index_wait(dev, index_reg, index, val);
 	}
 
 }
 
-static void mct_SetMaxLatency_D(struct DCTStatStruc *pDCTstat, u8 Channel, u8 DQSRcvEnDly)
+/* Calculate MaxRdLatency
+ * Algorithm detailed in the Fam10h BKDG Rev. 3.62 section 2.8.9.9.5
+ */
+static void mct_SetMaxLatency_D(struct DCTStatStruc *pDCTstat, u8 Channel, u16 DQSRcvEnDly)
 {
 	u32 dev;
 	u32 reg;
-	u16 SubTotal;
+	u32 SubTotal;
 	u32 index_reg;
 	u32 reg_off;
 	u32 val;
-	u32 valx;
+
+	uint8_t cpu_val_n;
+	uint8_t cpu_val_p;
+
+	u16 freq_tab[] = {400, 533, 667, 800};
+
+	/* Set up processor-dependent values */
+	if (pDCTstat->LogicalCPUID & AMD_DR_Dx) {
+		/* Revision D and above */
+		cpu_val_n = 4;
+		cpu_val_p = 29;
+	} else if (pDCTstat->LogicalCPUID & AMD_DR_Cx) {
+		/* Revision C */
+		uint8_t package_type = mctGet_NVbits(NV_PACK_TYPE);
+		if ((package_type == PT_L1)		/* Socket F (1207) */
+			|| (package_type == PT_M2)	/* Socket AM3 */
+			|| (package_type == PT_S1)) {	/* Socket S1g<x> */
+			cpu_val_n = 10;
+			cpu_val_p = 11;
+		} else {
+			cpu_val_n = 4;
+			cpu_val_p = 29;
+		}
+	} else {
+		/* Revision B and below */
+		cpu_val_n = 10;
+		cpu_val_p = 11;
+	}
 
 	if(pDCTstat->GangedMode)
 		Channel = 0;
@@ -598,49 +806,32 @@ static void mct_SetMaxLatency_D(struct DCTStatStruc *pDCTstat, u8 Channel, u8 DQ
 	val = Get_NB32(dev, 0x78 + reg_off);
 	SubTotal += 8 - (val & 0x0f);
 
-	/* Convert bits 7-5 (also referred to as the course delay) of
+	/* Convert bits 8-5 (also referred to as the coarse delay) of
 	 * the current (or worst case) DQS receiver enable delay to
 	 * 1/2 MEMCLKs units, rounding up, and add this to the sub-total.
 	 */
-	SubTotal += DQSRcvEnDly >> 5;	/*BOZO-no rounding up */
+	SubTotal += DQSRcvEnDly >> 5;	/* Retrieve gross delay portion of value */
 
-	/* Add 5.5 to the sub-total. 5.5 represents part of the
+	/* Add "P" to the sub-total. "P" represents part of the
 	 * processor specific constant delay value in the DRAM
 	 * clock domain.
 	 */
 	SubTotal <<= 1;		/*scale 1/2 MemClk to 1/4 MemClk */
-	SubTotal += 11;		/*add 5.5 1/2MemClk */
+	SubTotal += cpu_val_p;	/* add "P" (expressed in 1/4 MemClk units) */
+	SubTotal >>= 1;		/*scale 1/4 MemClk back to 1/2 MemClk */
 
 	/* Convert the sub-total (in 1/2 MEMCLKs) to northbridge
-	 * clocks (NCLKs) as follows (assuming DDR400 and assuming
-	 * that no P-state or link speed changes have occurred).
+	 * clocks (NCLKs)
 	 */
+	SubTotal *= 200 * ((Get_NB32(pDCTstat->dev_nbmisc, 0xd4) & 0x1f) + 4);
+	SubTotal /= freq_tab[((Get_NB32(pDCTstat->dev_dct, 0x94 + reg_off) & 0x7) - 3)];
+	SubTotal = (SubTotal + (2 - 1)) / 2;	/* Round up */
 
-	/* New formula:
-	 * SubTotal *= 3*(Fn2xD4[NBFid]+4)/(3+Fn2x94[MemClkFreq])/2 */
-	val = Get_NB32(dev, 0x94 + reg_off);
-
-	/* SubTotal div 4 to scale 1/4 MemClk back to MemClk */
-	val &= 7;
-	if (val >= 3) {
-		val <<= 1;
-	} else
-		val += 3;
-	valx = val << 2;
-
-	val = Get_NB32(pDCTstat->dev_nbmisc, 0xD4);
-	SubTotal *= ((val & 0x1f) + 4 ) * 3;
-
-	SubTotal /= valx;
-	if (SubTotal % valx) {	/* round up */
-		SubTotal++;
-	}
-
-	/* Add 5 NCLKs to the sub-total. 5 represents part of the
+	/* Add "N" NCLKs to the sub-total. "N" represents part of the
 	 * processor specific constant value in the northbridge
 	 * clock domain.
 	 */
-	SubTotal += 5;
+	SubTotal += (cpu_val_n) / 2;
 
 	pDCTstat->CH_MaxRdLat[Channel] = SubTotal;
 	if(pDCTstat->GangedMode) {
@@ -659,143 +850,6 @@ static void mct_SetMaxLatency_D(struct DCTStatStruc *pDCTstat, u8 Channel, u8 DQ
 	Set_NB32(dev, reg, val);
 }
 
-static u8 mct_SavePassRcvEnDly_D(struct DCTStatStruc *pDCTstat,
-			u8 rcvrEnDly, u8 Channel,
-			u8 receiver, u8 Pass)
-{
-	u8 i;
-	u8 mask_Saved, mask_Pass;
-	u8 *p;
-
-	/* calculate dimm offset
-	 * not needed for CH_D_B_RCVRDLY array
-	 */
-
-	/* cmp if there has new DqsRcvEnDly to be recorded */
-	mask_Pass = pDCTstat->DqsRcvEn_Pass;
-
-	if(Pass == SecondPass) {
-		mask_Pass = ~mask_Pass;
-	}
-
-	mask_Saved = pDCTstat->DqsRcvEn_Saved;
-	if(mask_Pass != mask_Saved) {
-
-		/* find desired stack offset according to channel/dimm/byte */
-		if(Pass == SecondPass) {
-			/* FIXME: SecondPass is never used for Barcelona p = pDCTstat->CH_D_B_RCVRDLY_1[Channel][receiver>>1]; */
-			p = 0; /* Keep the compiler happy. */
-		} else {
-			mask_Saved &= mask_Pass;
-			p = pDCTstat->CH_D_B_RCVRDLY[Channel][receiver>>1];
-		}
-		for(i=0; i < 8; i++) {
-			/* cmp per byte lane */
-			if(mask_Pass & (1 << i)) {
-				if(!(mask_Saved & (1 << i))) {
-					/* save RcvEnDly to stack, according to
-					the related Dimm/byte lane */
-					p[i] = (u8)rcvrEnDly;
-					mask_Saved |= 1 << i;
-				}
-			}
-		}
-		pDCTstat->DqsRcvEn_Saved = mask_Saved;
-	}
-	return mct_SaveRcvEnDly_D_1Pass(pDCTstat, Pass);
-}
-
-static u8 mct_CompareTestPatternQW0_D(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat,
-					u32 addr, u8 channel,
-					u8 pattern, u8 Pass)
-{
-	/* Compare only the first beat of data.  Since target addrs are cache
-	 * line aligned, the Channel parameter is used to determine which
-	 * cache QW to compare.
-	 */
-
-	u8 *test_buf;
-	u8 i;
-	u8 result;
-	u8 value;
-
-	if(Pass == FirstPass) {
-		if(pattern==1) {
-			test_buf = (u8 *)TestPattern1_D;
-		} else {
-			test_buf = (u8 *)TestPattern0_D;
-		}
-	} else {		/* Second Pass */
-		test_buf = (u8 *)TestPattern2_D;
-	}
-
-	SetUpperFSbase(addr);
-	addr <<= 8;
-
-	if((pDCTstat->Status & (1<<SB_128bitmode)) && channel ) {
-		addr += 8;	/* second channel */
-		test_buf += 8;
-	}
-
-	print_debug_dqs_pair("\t\t\t\t\t\t  test_buf = ", (u32)test_buf, "  |  addr_lo = ", addr,  4);
-	for (i=0; i<8; i++, addr ++) {
-		value = read32_fs(addr);
-		print_debug_dqs_pair("\t\t\t\t\t\t\t\t ", test_buf[i], "  |  ", value, 4);
-
-		if (value == test_buf[i]) {
-			pDCTstat->DqsRcvEn_Pass |= (1<<i);
-		} else {
-			pDCTstat->DqsRcvEn_Pass &= ~(1<<i);
-		}
-	}
-
-	result = DQS_FAIL;
-
-	if (Pass == FirstPass) {
-		/* if first pass, at least one byte lane pass
-		 * ,then DQS_PASS=1 and will set to related reg.
-		 */
-		if(pDCTstat->DqsRcvEn_Pass != 0) {
-			result = DQS_PASS;
-		} else {
-			result = DQS_FAIL;
-		}
-
-	} else {
-		/* if second pass, at least one byte lane fail
-		 * ,then DQS_FAIL=1 and will set to related reg.
-		 */
-		if(pDCTstat->DqsRcvEn_Pass != 0xFF) {
-			result = DQS_FAIL;
-		} else {
-			result = DQS_PASS;
-		}
-	}
-
-	/* if second pass, we can't find the fail until FFh,
-	 * then let it fail to save the final delay
-	 */
-	if((Pass == SecondPass) && (pDCTstat->Status & (1 << SB_DQSRcvLimit))) {
-		result = DQS_FAIL;
-		pDCTstat->DqsRcvEn_Pass = 0;
-	}
-
-	/* second pass needs to be inverted
-	 * FIXME? this could be inverted in the above code to start with...
-	 */
-	if(Pass == SecondPass) {
-		if (result == DQS_PASS) {
-			result = DQS_FAIL;
-		} else if (result == DQS_FAIL) { /* FIXME: doesn't need to be else if */
-			result = DQS_PASS;
-		}
-	}
-
-
-	return result;
-}
-
 static void mct_InitDQSPos4RcvrEn_D(struct MCTStatStruc *pMCTstat,
 				struct DCTStatStruc *pDCTstat)
 {
@@ -854,7 +908,7 @@ void SetEccDQSRcvrEn_D(struct DCTStatStruc *pDCTstat, u8 Channel)
 	u32 index_reg;
 	u32 index;
 	u8 ChipSel;
-	u8 *p;
+	u16 *p;
 	u32 val;
 
 	dev = pDCTstat->dev_dct;
@@ -884,7 +938,7 @@ static void CalcEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat,
 
 	for (ChipSel = 0; ChipSel < MAX_CS_SUPPORTED; ChipSel += 2) {
 		if(mct_RcvrRankEnabled_D(pMCTstat, pDCTstat, Channel, ChipSel)) {
-			u8 *p;
+			u16 *p;
 			p = pDCTstat->CH_D_B_RCVRDLY[Channel][ChipSel>>1];
 
 			/* DQS Delay Value of Data Bytelane
@@ -920,6 +974,10 @@ static void CalcEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat,
 	SetEccDQSRcvrEn_D(pDCTstat, Channel);
 }
 
+/* 2.8.9.9.4
+ * ECC Byte Lane Training
+ * DQS Receiver Enable Delay
+ */
 void mctSetEccDQSRcvrEn_D(struct MCTStatStruc *pMCTstat,
 			struct DCTStatStruc *pDCTstatA)
 {
@@ -1017,7 +1075,9 @@ static void fenceDynTraining_D(struct MCTStatStruc *pMCTstat,
 		avRecValue -= 3;
 	else
 	*/
-	if (pDCTstat->LogicalCPUID & AMD_DR_Cx)
+	if (pDCTstat->LogicalCPUID & AMD_DR_Dx)
+		avRecValue -= 8;
+	else if (pDCTstat->LogicalCPUID & AMD_DR_Cx)
 		avRecValue -= 8;
 	else if (pDCTstat->LogicalCPUID & AMD_DR_Bx)
 		avRecValue -= 8;
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc1p.c b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc1p.c
index c009756..f01e011 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc1p.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc1p.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -36,17 +37,12 @@ u32 SetupDqsPattern_1PassB(u8 pass)
 	return (u32) TestPattern0_D;
 }
 
-u8  mct_Get_Start_RcvrEnDly_1Pass(u8 pass)
-{
-	return 0;
-}
-
-static u8 mct_Average_RcvrEnDly_1Pass(struct DCTStatStruc *pDCTstat, u8 Channel, u8 Receiver,
+static u16 mct_Average_RcvrEnDly_1Pass(struct DCTStatStruc *pDCTstat, u8 Channel, u8 Receiver,
 					u8 Pass)
 {
-	u8 i, MaxValue;
-	u8 *p;
-	u8 val;
+	u16 i, MaxValue;
+	u16 *p;
+	u16 val;
 
 	MaxValue = 0;
 	p = pDCTstat->CH_D_B_RCVRDLY[Channel][Receiver >> 1];
@@ -76,8 +72,8 @@ u8 mct_SaveRcvEnDly_D_1Pass(struct DCTStatStruc *pDCTstat, u8 pass)
 	return ret;
 }
 
-u8 mct_Average_RcvrEnDly_Pass(struct DCTStatStruc *pDCTstat,
-				u8 RcvrEnDly, u8 RcvrEnDlyLimit,
+u16 mct_Average_RcvrEnDly_Pass(struct DCTStatStruc *pDCTstat,
+				u16 RcvrEnDly, u16 RcvrEnDlyLimit,
 				u8 Channel, u8 Receiver, u8 Pass)
 
 {
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc2p.c b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc2p.c
index b01889d..796febc 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctsrc2p.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctsrc2p.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -74,15 +75,15 @@ u8 mct_Get_Start_RcvrEnDly_Pass(struct DCTStatStruc *pDCTstat,
 	return RcvrEnDly;
 }
 
-u8 mct_Average_RcvrEnDly_Pass(struct DCTStatStruc *pDCTstat,
-				u8 RcvrEnDly, u8 RcvrEnDlyLimit,
+u16 mct_Average_RcvrEnDly_Pass(struct DCTStatStruc *pDCTstat,
+				u16 RcvrEnDly, u16 RcvrEnDlyLimit,
 				u8 Channel, u8 Receiver, u8 Pass)
 {
 	u8 i;
-	u8 *p;
-	u8 *p_1;
-	u8 val;
-	u8 val_1;
+	u16 *p;
+	u16 *p_1;
+	u16 val;
+	u16 val_1;
 	u8 valid = 1;
 	u8 bn;
 
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mcttmrl.c b/src/northbridge/amd/amdmct/mct_ddr3/mcttmrl.c
index ea5c8c7..920f514 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mcttmrl.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mcttmrl.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -191,10 +192,10 @@ static void maxRdLatencyTrain_D(struct MCTStatStruc *pMCTstat,
 
 #if DQS_TRAIN_DEBUG > 0
 	{
-		u8 Channel;
+		u8 ChannelDTD;
 		printk(BIOS_DEBUG, "maxRdLatencyTrain: CH_MaxRdLat:\n");
-		for(Channel = 0; Channel<2; Channel++) {
-			printk(BIOS_DEBUG, "Channel: %02x: %02x\n", Channel, pDCTstat->CH_MaxRdLat[Channel]);
+		for(ChannelDTD = 0; ChannelDTD<2; ChannelDTD++) {
+			printk(BIOS_DEBUG, "Channel: %02x: %02x\n", ChannelDTD, pDCTstat->CH_MaxRdLat[ChannelDTD]);
 		}
 	}
 #endif
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c b/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
index cdeae49..1c3e322 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mctwl.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -58,9 +59,9 @@ void PrepareC_DCT(struct MCTStatStruc *pMCTstat,
 	pDCTstat->C_DCTPtr[dct]->LogicalCPUID = pDCTstat->LogicalCPUID;
 
 	for (dimm = 0; dimm < MAX_DIMMS; dimm++) {
-		if (DimmValid & (1 << dimm))
+		if (DimmValid & (1 << (dimm << 1)))
 			pDCTstat->C_DCTPtr[dct]->DimmPresent[dimm] = 1;
-		if (Dimmx8Present & (1 << dimm))
+		if (Dimmx8Present & (1 << (dimm << 1)))
 			pDCTstat->C_DCTPtr[dct]->DimmX8Present[dimm] = 1;
 	}
 
@@ -88,9 +89,9 @@ void PrepareC_DCT(struct MCTStatStruc *pMCTstat,
 		u8  DimmRanks;
 		if (DimmValid & (1 << (dimm << 1))) {
 			DimmRanks = 1;
-			if (pDCTstat->DimmDRPresent & (1 << (dimm+dct)))
+			if (pDCTstat->DimmDRPresent & (1 << ((dimm << 1) + dct)))
 				DimmRanks = 2;
-			else if (pDCTstat->DimmQRPresent & (1 << (dimm+dct)))
+			else if (pDCTstat->DimmQRPresent & (1 << ((dimm << 1) + dct)))
 				DimmRanks = 4;
 		} else
 			DimmRanks = 0;
@@ -249,35 +250,6 @@ static void ChangeMemClk(struct MCTStatStruc *pMCTstat,
 	}
 }
 
-/* Multiply the previously saved delay values in Pass 1, step #5 by
-   (target frequency)/400 to find the gross and fine delay initialization
-   values at the target frequency.
- */
-void MultiplyDelay(struct MCTStatStruc *pMCTstat,
-					struct DCTStatStruc *pDCTstat, u8 dct)
-{
-	u16 index;
-	u8 Multiplier;
-	u8 gross, fine;
-	u16 total;
-
-	Multiplier = pDCTstat->TargetFreq;
-
-	for (index=0; index < MAX_BYTE_LANES*MAX_LDIMMS; index ++) {
-		gross = pDCTstat->C_DCTPtr[dct]->WLGrossDelay[index];
-		fine = pDCTstat->C_DCTPtr[dct]->WLFineDelay[index];
-
-		total = gross << 5 | fine;
-		total *= Multiplier;
-		if (total % 3)
-			total = total / 3 + 1;
-		else
-			total = total / 3;
-		pDCTstat->C_DCTPtr[dct]->WLGrossDelay[index] = (total & 0xFF) >> 5;
-		pDCTstat->C_DCTPtr[dct]->WLFineDelay[index] = total & 0x1F;
-	}
-}
-
 /*
  * the DRAM controller to bring the DRAMs out of self refresh mode.
  */
@@ -352,9 +324,9 @@ void SetTargetFreq(struct MCTStatStruc *pMCTstat,
 
 		if (!DCT1Present)
 			pDCTstat->CSPresent = pDCTstat->CSPresent_DCT[0];
-		else if (pDCTstat->GangedMode) {
+		else if (pDCTstat->GangedMode)
 			pDCTstat->CSPresent = 0;
-		} else
+		else
 			pDCTstat->CSPresent = pDCTstat->CSPresent_DCT[1];
 
 		FreqChgCtrlWrd(pMCTstat, pDCTstat);
diff --git a/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c b/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c
index 212a348..67d705c 100644
--- a/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c
+++ b/src/northbridge/amd/amdmct/mct_ddr3/mhwlc_d.c
@@ -2,6 +2,7 @@
  * This file is part of the coreboot project.
  *
  * Copyright (C) 2010 Advanced Micro Devices, Inc.
+ * Copyright (C) 2015 Timothy Pearson <tpearson@raptorengineeringinc.com>, Raptor Engineering
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -235,6 +236,65 @@ u32 swapBankBits(sDCTStruct *pDCTData, u32 MRSValue)
 	return MRSValue;
 }
 
+static uint16_t unbuffered_dimm_nominal_termination_emrs(uint8_t number_of_dimms, uint8_t frequency_index, uint8_t rank_count, uint8_t rank)
+{
+	uint16_t term;
+
+	/* FIXME
+	 * Mainboards need to be able to specify the maximum number of DIMMs installable per channel
+	 * For now assume a maximum of 2 DIMMs per channel can be installed
+	 */
+	uint8_t MaxDimmsInstallable = 2;
+
+	if (number_of_dimms == 1) {
+		if (MaxDimmsInstallable < 3) {
+			term = 0x04;	/* Rtt_Nom=RZQ/4=60 Ohm */
+		} else {
+			if (rank_count == 1) {
+				term = 0x04;	/* Rtt_Nom=RZQ/4=60 Ohm */
+			} else {
+				if (rank == 0)
+					term = 0x04;	/* Rtt_Nom=RZQ/4=60 Ohm */
+				else
+					term = 0x00;	/* Rtt_Nom=OFF */
+			}
+		}
+	} else {
+		if (frequency_index < 5)
+			term = 0x0044;	/* Rtt_Nom=RZQ/6=40 Ohm */
+		else
+			term = 0x0204;	/* Rtt_Nom=RZQ/8=30 Ohm */
+	}
+
+	return term;
+}
+
+static uint16_t unbuffered_dimm_dynamic_termination_emrs(uint8_t number_of_dimms, uint8_t frequency_index, uint8_t rank_count, uint8_t rank)
+{
+	uint16_t term;
+
+	/* FIXME
+	 * Mainboards need to be able to specify the maximum number of DIMMs installable per channel
+	 * For now assume a maximum of 2 DIMMs per channel can be installed
+	 */
+	uint8_t MaxDimmsInstallable = 2;
+
+	if (number_of_dimms == 1) {
+		if (MaxDimmsInstallable < 3) {
+			term = 0x00;	/* Rtt_WR=off */
+		} else {
+			if (rank_count == 1)
+				term = 0x00;	/* Rtt_WR=off */
+			else
+				term = 0x200;	/* Rtt_WR=RZQ/4=60 Ohm */
+		}
+	} else {
+		term = 0x400;	/* Rtt_WR=RZQ/2=120 Ohm */
+	}
+
+	return term;
+}
+
 /*-----------------------------------------------------------------------------
  *  void prepareDimms(sMCTStruct *pMCTData, sDCTStruct *DCTData, u8 Dimm, BOOL WL)
  *
@@ -295,48 +355,23 @@ void prepareDimms(sMCTStruct *pMCTData, sDCTStruct *pDCTData, u8 dimm, BOOL wl)
 		if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
 			tempW1 = RttNomTargetRegDimm(pMCTData, pDCTData, dimm, wl, MemClkFreq, rank);
 		} else {
-			if (wl)
-			{
-				if (pDCTData->MaxDimmsInstalled == 1)
-				{
-					if ((pDCTData->DimmRanks[dimm] == 2) && (rank == 0))
-					{
-						tempW1 = 0x00;	/* Rtt_Nom=OFF */
-					}
+			if (wl) {
+				if (rank == 0) {
+					/* Get Rtt_WR for the current DIMM and rank */
+					uint16_t dynamic_term = unbuffered_dimm_dynamic_termination_emrs(pDCTData->MaxDimmsInstalled, MemClkFreq, pDCTData->DimmRanks[dimm], rank);
+
+					/* Convert dynamic termination code to corresponding nominal termination code */
+					if (dynamic_term == 0x200)
+						tempW1 = 0x04;
+					else if (dynamic_term == 0x400)
+						tempW1 = 0x40;
 					else
-					{
-						tempW1 = 0x04;	/* Rtt_Nom=RZQ/4=60 Ohm */
-					}
-				}
-				else	/* 2 Dimms or more per channel */
-				{
-					if ((pDCTData->DimmRanks[dimm] == 2) && (rank == 1))
-					{
-						tempW1 = 0x00;	/* Rtt_Nom=OFF */
-					}
-					else
-					{
-						if (MemClkFreq == 6) {
-							tempW1 = 0x04;	/* Rtt_Nom=RZQ/4=60 Ohm */
-						} else {
-							tempW1 = 0x40;/* Rtt_Nom=RZQ/2=120 Ohm */
-						}
-					}
-				}
-			}
-			else {	/* 1 or 4 Dimms per channel */
-				if ((pDCTData->MaxDimmsInstalled == 1) || (pDCTData->MaxDimmsInstalled == 4))
-				{
-					tempW1 = 0x04;	/* Rtt_Nom=RZQ/4=60 Ohm */
-				}
-				else	/* 2 or 3 Dimms per channel */
-				{
-					if (MemClkFreq < 5) {
-						tempW1 = 0x0044;	/* Rtt_Nom=RZQ/6=40 Ohm */
-					} else {
-						tempW1 = 0x0204;	/* Rtt_Nom=RZQ/8=30 Ohm */
-					}
+						tempW1 = 0x0;
+				} else {
+					tempW1 = unbuffered_dimm_nominal_termination_emrs(pDCTData->MaxDimmsInstalled, MemClkFreq, pDCTData->DimmRanks[dimm], rank);
 				}
+			} else {
+				tempW1 = unbuffered_dimm_nominal_termination_emrs(pDCTData->MaxDimmsInstalled, MemClkFreq, pDCTData->DimmRanks[dimm], rank);
 			}
 		}
 		tempW=tempW|tempW1;
@@ -353,20 +388,22 @@ void prepareDimms(sMCTStruct *pMCTData, sDCTStruct *pDCTData, u8 dimm, BOOL wl)
 			else
 			{
 				/* Disable the output drivers of all other ranks for
-				 * the target DIMM. */
+				 * the target DIMM.
+				 */
 				tempW = bitTestSet(tempW1, Qoff);
 			}
 		}
-		/* program MrsAddress[5,1]=output driver impedance control (DIC):
-		 * based on F2x[1,0]84[DrvImpCtrl] */
+		/* Program MrsAddress[5,1]=output driver impedance control (DIC):
+		 * based on F2x[1,0]84[DrvImpCtrl]
+		 */
 		tempW1 = get_Bits(pDCTData, pDCTData->CurrDct, pDCTData->NodeId,
 				FUN_DCT, DRAM_MRS_REGISTER, DrvImpCtrlStart, DrvImpCtrlEnd);
-		if (bitTest(tempW1,1))
-		{tempW = bitTestSet(tempW, 5);}
-		if (bitTest(tempW1,0))
-		{tempW = bitTestSet(tempW, 1);}
+		if (bitTest(tempW1, 1))
+			tempW = bitTestSet(tempW, 5);
+		if (bitTest(tempW1, 0))
+			tempW = bitTestSet(tempW, 1);
 
-		tempW = swapAddrBits_wl(pDCTData,tempW);
+		tempW = swapAddrBits_wl(pDCTData, tempW);
 
 		set_Bits(pDCTData, pDCTData->CurrDct, pDCTData->NodeId, FUN_DCT,
 			DRAM_INIT, MrsAddressStart, MrsAddressEnd, tempW);
@@ -404,29 +441,10 @@ void prepareDimms(sMCTStruct *pMCTData, sDCTStruct *pDCTData, u8 dimm, BOOL wl)
 		if ((pDCTData->LogicalCPUID & AMD_DR_Bx) && (pDCTData->Status[DCT_STATUS_REGISTERED]))
 			tempW+=0x8;
 		/* determine Rtt_WR for WL & Normal mode */
-		if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
+		if (pDCTData->Status[DCT_STATUS_REGISTERED])
 			tempW1 = RttWrRegDimm(pMCTData, pDCTData, dimm, wl, MemClkFreq, rank);
-		} else {
-			if (wl)
-			{
-				tempW1 = 0x00;	/* Rtt_WR=off */
-			}
-			else
-			{
-				if (pDCTData->MaxDimmsInstalled == 1)
-				{
-					tempW1 = 0x00;	/* Rtt_WR=off */
-				}
-				else
-				{
-					if (MemClkFreq == 6) {
-						tempW1 = 0x200;	/* Rtt_WR=RZQ/4=60 Ohm */
-					} else {
-						tempW1 = 0x400;	/* Rtt_WR=RZQ/2 */
-					}
-				}
-			}
-		}
+		else
+			tempW1 = unbuffered_dimm_dynamic_termination_emrs(pDCTData->MaxDimmsInstalled, MemClkFreq, pDCTData->DimmRanks[dimm], rank);
 		tempW=tempW|tempW1;
 		tempW = swapAddrBits_wl(pDCTData,tempW);
 		set_Bits(pDCTData, pDCTData->CurrDct, pDCTData->NodeId, FUN_DCT,
@@ -483,38 +501,10 @@ void prepareDimms(sMCTStruct *pMCTData, sDCTStruct *pDCTData, u8 dimm, BOOL wl)
 					}
 
 					/* determine Rtt_Nom for WL & Normal mode */
-					if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
+					if (pDCTData->Status[DCT_STATUS_REGISTERED])
 						tempW1 = RttNomNonTargetRegDimm(pMCTData, pDCTData, currDimm, wl, MemClkFreq, rank);
-					} else {
-						if (wl)
-						{
-							if ((pDCTData->DimmRanks[currDimm] == 2) && (rank == 1))
-							{
-								tempW1 = 0x00;	/* Rtt_Nom=OFF */
-							}
-							else
-							{
-								if (MemClkFreq < 5) {
-									tempW1 = 0x0044;/* Rtt_Nom=RZQ/6=40 Ohm */
-								} else {
-									tempW1 = 0x0204;/* Rtt_Nom=RZQ/8=30 Ohm */
-								}
-							}
-						}
-						else {	/* 1 or 4 Dimms per channel */
-							if (pDCTData->MaxDimmsInstalled == 4)
-							{
-								tempW1 = 0x04;	/* Rtt_Nom=RZQ/4=60 Ohm */
-							}
-							else {	/* 2 or 3 Dimms per channel */
-								if (MemClkFreq < 5) {
-									tempW1 = 0x0044;	/* Rtt_Nom=RZQ/6=40 Ohm */
-								} else {
-									tempW1 = 0x0204;	/* Rtt_Nom=RZQ/8=30 Ohm */
-								}
-							}
-						}
-					}
+					else
+						tempW1 = unbuffered_dimm_nominal_termination_emrs(pDCTData->MaxDimmsInstalled, MemClkFreq, pDCTData->DimmRanks[currDimm], rank);
 					tempW=tempW|tempW1;
 					/* program MrsAddress[5,1]=output driver impedance control (DIC):
 					 * based on F2x[1,0]84[DrvImpCtrl] */
@@ -560,22 +550,10 @@ void prepareDimms(sMCTStruct *pMCTData, sDCTStruct *pDCTData, u8 dimm, BOOL wl)
 					if ((pDCTData->LogicalCPUID & AMD_DR_Bx) && (pDCTData->Status[DCT_STATUS_REGISTERED]))
 						tempW+=0x8;
 					/* determine Rtt_WR for WL & Normal mode */
-					if (pDCTData->Status[DCT_STATUS_REGISTERED]) {
+					if (pDCTData->Status[DCT_STATUS_REGISTERED])
 						tempW1 = RttWrRegDimm(pMCTData, pDCTData, currDimm, wl, MemClkFreq, rank);
-					} else {
-						if (wl)
-						{
-							tempW1 = 0x00;	/* Rtt_WR=off */
-						}
-						else
-						{
-							if (MemClkFreq == 6) {
-								tempW1 = 0x200;	/* Rtt_WR=RZQ/4=60 Ohm */
-							} else {
-								tempW1 = 0x400;	/* Rtt_WR=RZQ/2 */
-							}
-						}
-					}
+					else
+						tempW1 = unbuffered_dimm_dynamic_termination_emrs(pDCTData->MaxDimmsInstalled, MemClkFreq, pDCTData->DimmRanks[currDimm], rank);
 					tempW=tempW|tempW1;
 					tempW = swapAddrBits_wl(pDCTData,tempW);
 					set_Bits(pDCTData, pDCTData->CurrDct, pDCTData->NodeId, FUN_DCT,
@@ -646,9 +624,14 @@ void programODT(sMCTStruct *pMCTData, sDCTStruct *pDCTData, u8 dimm)
  */
 void procConifg(sMCTStruct *pMCTData,sDCTStruct *pDCTData, u8 dimm, u8 pass)
 {
-	u8 ByteLane, Seed_Gross, Seed_Fine;
+	u8 ByteLane, Seed_Gross, Seed_Fine, MemClkFreq;
 	u32 Value, Addr;
 	u16 Addl_Data_Offset, Addl_Data_Port;
+	u16 freq_tab[] = {400, 533, 667, 800};
+
+	/* MemClkFreq: 3: 400MHz; 4: 533MHz; 5: 667MHz; 6: 800MHz */
+	MemClkFreq = get_Bits(pDCTData, pDCTData->CurrDct, pDCTData->NodeId,
+				FUN_DCT, DRAM_CONFIG_HIGH, 0, 2);
 
 	/* Program F2x[1, 0]9C_x08[WrLvOdt[3:0]] to the proper ODT settings for the
 	 * current memory subsystem configuration.
@@ -656,12 +639,13 @@ void procConifg(sMCTStruct *pMCTData,sDCTStruct *pDCTData, u8 dimm, u8 pass)
 	programODT(pMCTData, pDCTData, dimm);
 
 	/* Program F2x[1,0]9C_x08[WrLvOdtEn]=1 */
-	if (pDCTData->LogicalCPUID & (AMD_DR_Cx | AMD_DR_Dx))
+	if (pDCTData->LogicalCPUID & (AMD_DR_Cx | AMD_DR_Dx)) {
 		set_DCT_ADDR_Bits(pDCTData, pDCTData->DctTrain, pDCTData->NodeId, FUN_DCT,
 				DRAM_ADD_DCT_PHY_CONTROL_REG, WrLvOdtEn, WrLvOdtEn, (u32)1);
+	}
 	else
 	{
-		/* Program WrLvOdtEn=1 through set bit 12 of D3CSODT reg offset 0 for Rev.B*/
+		/* Program WrLvOdtEn=1 through set bit 12 of D3CSODT reg offset 0 for Rev.B */
 		if (pDCTData->DctTrain)
 		{
 			Addl_Data_Offset=0x198;
@@ -687,7 +671,6 @@ void procConifg(sMCTStruct *pMCTData,sDCTStruct *pDCTData, u8 dimm, u8 pass)
 
 	/* Wait 10 MEMCLKs to allow for ODT signal settling. */
 	pMCTData->AgesaDelay(10);
-	ByteLane = 0;
 	if (pass == 1)
 	{
 		if (pDCTData->Status[DCT_STATUS_REGISTERED])
@@ -705,10 +688,17 @@ void procConifg(sMCTStruct *pMCTData,sDCTStruct *pDCTData, u8 dimm, u8 pass)
 		}
 		else
 		{
-			Seed_Gross = 0x00;
-			Seed_Fine = 0x1A;
+			if (MemClkFreq == 6) {
+				/* DDR-800 */
+				Seed_Gross = 0x00;
+				Seed_Fine = 0x1a;
+			} else {
+				/* Use settings for DDR-400 (interpolated from BKDG) */
+				Seed_Gross = 0x00;
+				Seed_Fine = 0x0d;
+			}
 		}
-		while(ByteLane < MAX_BYTE_LANES)
+		for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++)
 		{
 			/* Program an initialization value to registers F2x[1, 0]9C_x[51:50] and
 			 * F2x[1, 0]9C_x52 to set the gross and fine delay for all the byte lane fields
@@ -720,35 +710,32 @@ void procConifg(sMCTStruct *pMCTData,sDCTStruct *pDCTData, u8 dimm, u8 pass)
 			 */
 			pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
 			pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
-			ByteLane++;
 		}
-	} else if (pDCTData->Status[DCT_STATUS_REGISTERED]) {		/* For Pass 2 */
+	} else { 		/* Pass 2 */
 		/* From BKDG, Write Leveling Seed Value. */
-		/* TODO: The unbuffered DIMMs are unstable on the code below. So temporarily it is
-		 * only for registered DIMMs. */
 		u32 RegisterDelay, SeedTotal;
-		u8 MemClkFreq;
-		u16 freq_tab[] = {400, 533, 667, 800};
-		while(ByteLane < MAX_BYTE_LANES)
+		for (ByteLane = 0; ByteLane < MAX_BYTE_LANES; ByteLane++)
 		{
-			MemClkFreq = get_Bits(pDCTData, pDCTData->CurrDct, pDCTData->NodeId,
-					      FUN_DCT, DRAM_CONFIG_HIGH, 0, 2);
 			if (pDCTData->Status[DCT_STATUS_REGISTERED])
 				RegisterDelay = 0x20; /* TODO: ((RCW2 & BIT0) == 0) ? 0x20 : 0x30; */
 			else
 				RegisterDelay = 0;
-			SeedTotal = (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1F) |
-				pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] << 5;
+			SeedTotal = (pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] & 0x1f) |
+				(pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] << 5);
 			/* SeedTotalPreScaling = (the total delay value in F2x[1, 0]9C_x[4A:30] from pass 1 of write levelization
 			   training) - RegisterDelay. */
-			/* MemClkFreq: 3: 400MHz; 4: 533MHz; 5: 667MHz; 6: 800MHz */
-			SeedTotal = (u16) (RegisterDelay + ((((u32) SeedTotal - RegisterDelay) *
-							     freq_tab[MemClkFreq-3]) / 400));
-			Seed_Gross = (SeedTotal & 0x20) != 0 ? 1 : 2;
-			Seed_Fine = SeedTotal & 0x1F;
+			SeedTotal = (uint16_t) (RegisterDelay + ((((uint64_t) SeedTotal - RegisterDelay) *
+								freq_tab[MemClkFreq-3] * 100) / (freq_tab[0] * 100)));
+			Seed_Gross = SeedTotal / 32;
+			Seed_Fine = SeedTotal & 0x1f;
+			if (Seed_Gross == 0)
+				Seed_Gross = 0;
+			else if (Seed_Gross & 0x1)
+				Seed_Gross = 1;
+			else
+				Seed_Gross = 2;
 			pDCTData->WLGrossDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Gross;
 			pDCTData->WLFineDelay[MAX_BYTE_LANES*dimm+ByteLane] = Seed_Fine;
-			ByteLane ++;
 		}
 	}
 
diff --git a/src/northbridge/amd/amdmct/wrappers/mcti_d.c b/src/northbridge/amd/amdmct/wrappers/mcti_d.c
index ea32893..c00cf24 100644
--- a/src/northbridge/amd/amdmct/wrappers/mcti_d.c
+++ b/src/northbridge/amd/amdmct/wrappers/mcti_d.c
@@ -59,6 +59,10 @@ static u16 mctGet_NVbits(u8 index)
 		val = 1;
 #elif CONFIG_CPU_SOCKET_TYPE == 0x13	/* ASB2 */
 		val = 4;
+#elif CONFIG_CPU_SOCKET_TYPE == 0x14	/* C32 */
+		val = 5;
+#elif CONFIG_CPU_SOCKET_TYPE == 0x15	/* G34 */
+		val = 3;
 //#elif SYSTEM_TYPE == MOBILE
 //		val = 2;
 #endif
@@ -297,6 +301,8 @@ static void mctGet_MaxLoadFreq(struct DCTStatStruc *pDCTstat)
 	/* Determine the number of installed DIMMs */
 	int ch1_count = 0;
 	int ch2_count = 0;
+	uint8_t ch1_registered = 0;
+	uint8_t ch2_registered = 0;
 	int i;
 	for (i = 0; i < 15; i = i + 2) {
 		if (pDCTstat->DIMMValid & (1 << i))
@@ -304,6 +310,12 @@ static void mctGet_MaxLoadFreq(struct DCTStatStruc *pDCTstat)
 		if (pDCTstat->DIMMValid & (1 << (i + 1)))
 			ch2_count++;
 	}
+	for (i = 0; i < MAX_DIMMS_SUPPORTED; i = i + 2) {
+		if (pDCTstat->DimmRegistered[i])
+			ch1_registered = 1;
+		if (pDCTstat->DimmRegistered[i + 1])
+			ch2_registered = 1;
+	}
 	if (IS_ENABLED(CONFIG_DEBUG_RAM_SETUP)) {
 		printk(BIOS_DEBUG, "mctGet_MaxLoadFreq: Channel 1: %d DIMM(s) detected\n", ch1_count);
 		printk(BIOS_DEBUG, "mctGet_MaxLoadFreq: Channel 2: %d DIMM(s) detected\n", ch2_count);
@@ -413,101 +425,6 @@ static void mctHookAfterDramInit(void)
 }
 
 #if (CONFIG_DIMM_SUPPORT & 0x000F)==0x0005 /* AMD_FAM10_DDR3 */
-static void coreDelay(u32 microseconds)
-{
-	msr_t now;
-	msr_t end;
-	u32 cycles;
-
-	/* delay ~40us
-	   This seems like a hack to me...
-	   It would be nice to have a central delay function. */
-
-	cycles = (microseconds * 100) << 3;  /* x8 (number of 1.25ns ticks) */
-
-        if (!(rdmsr(HWCR).lo & TSC_FREQ_SEL_MASK)) {
-            msr_t pstate_msr = rdmsr(CUR_PSTATE_MSR);
-            if (!(rdmsr(0xC0010064+pstate_msr.lo).lo & NB_DID_M_ON)) {
-	      cycles = cycles <<1; // half freq, double cycles
-	    }
-	} // else should we keep p0 freq at the time of setting TSC_FREQ_SEL_MASK somewhere and check it here ?
-
-	now = rdmsr(TSC_MSR);
-        // avoid overflow when called near 2^32 ticks ~ 5.3 s boundaries
-	if (0xffffffff - cycles >= now.lo ) {
-	  end.hi =  now.hi;
-          end.lo = now.lo + cycles;
-	} else {
-          end.hi = now.hi +1; //
-          end.lo = cycles - (1+(0xffffffff - now.lo));
-	}
-	do {
-          now = rdmsr(TSC_MSR);
-        } while ((now.hi < end.hi) || ((now.hi == end.hi) && (now.lo < end.lo)));
-}
-
-/* Erratum 350 */
-static void vErrata350(struct MCTStatStruc *pMCTstat, struct DCTStatStruc *pDCTstat)
-{
-	u8 u8Channel;
-	u8 u8Receiver;
-	u32 u32Addr;
-	u8 u8Valid;
-	u32 u32DctDev;
-
-	// 1. dummy read for each installed DIMM */
-	for (u8Channel = 0; u8Channel < 2; u8Channel++) {
-		// This will be 0 for vaild DIMMS, eles 8
-		u8Receiver = mct_InitReceiver_D(pDCTstat, u8Channel);
-
-		for (; u8Receiver < 8; u8Receiver += 2) {
-			u32Addr = mct_GetRcvrSysAddr_D(pMCTstat, pDCTstat, u8Channel, u8Receiver, &u8Valid);
-
-			if(!u8Valid) {	/* Address not supported on current CS */
-				print_t("vErrata350: Address not supported on current CS\n");
-				continue;
-			}
-			print_t("vErrata350: dummy read \n");
-			read32_fs(u32Addr);
-		}
-	}
-
-	print_t("vErrata350: step 2a\n");
-
-	/* 2. Write 0000_8000h to register F2x[1, 0]9C_xD080F0C. */
-	u32DctDev = pDCTstat->dev_dct;
-	Set_NB32_index_wait(u32DctDev, 0x098, 0xD080F0C, 0x00008000);
-	/*                                                ^--- value
-	                                        ^---F2x[1, 0]9C_x0D080F0C, No description in BKDG.
-	                                 ^----F2x[1, 0]98 DRAM Controller Additional Data Offset Register */
-
-	if(!pDCTstat->GangedMode) {
-		print_t("vErrata350: step 2b\n");
-		Set_NB32_index_wait(u32DctDev, 0x198, 0xD080F0C, 0x00008000);
-		/*                                                ^--- value
-		                                        ^---F2x[1, 0]9C_x0D080F0C, No description in BKDG
-		                                ^----F2x[1, 0]98 DRAM Controller Additional Data Offset Register */
-	}
-
-	print_t("vErrata350: step 3\n");
-	/* 3. Wait at least 300 nanoseconds. */
-	coreDelay(1);
-
-	print_t("vErrata350: step 4\n");
-	/* 4. Write 0000_0000h to register F2x[1, 0]9C_xD080F0C. */
-	Set_NB32_index_wait(u32DctDev, 0x098, 0xD080F0C, 0x00000000);
-
-	if(!pDCTstat->GangedMode) {
-		print_t("vErrata350: step 4b\n");
-		Set_NB32_index_wait(u32DctDev, 0x198, 0xD080F0C, 0x00000000);
-	}
-
-	print_t("vErrata350: step 5\n");
-	/* 5. Wait at least 2 microseconds. */
-	coreDelay(2);
-
-}
-
 static void vErratum372(struct DCTStatStruc *pDCTstat)
 {
         msr_t msr = rdmsr(NB_CFG_MSR);
@@ -546,8 +463,7 @@ static void mctHookBeforeAnyTraining(struct MCTStatStruc *pMCTstat, struct DCTSt
 {
 #if (CONFIG_DIMM_SUPPORT & 0x000F)==0x0005 /* AMD_FAM10_DDR3 */
   /* FIXME :  as of 25.6.2010 errata 350 and 372 should apply to  ((RB|BL|DA)-C[23])|(HY-D[01])|(PH-E0) but I don't find constants for all of them */
-	if (pDCTstatA->LogicalCPUID & AMD_DRBH_Cx) {
-		vErrata350(pMCTstat, pDCTstatA);
+	if (pDCTstatA->LogicalCPUID & (AMD_DRBH_Cx | AMD_DR_Dx)) {
 		vErratum372(pDCTstatA);
 		vErratum414(pDCTstatA);
 	}
-- 
1.7.9.5

